Wednesday, May 26, 2010

A real hiragana to romaji function

Recently I had to find a solution for transliterating words written in hiragana into Latin characters. I have to admit that I was disappointed with what I found. Basically, those functions/programs were just doing a key/value lookup without taking into account some basic rules, such as:
- The small "っ", as in "せってい" (setting), which doubles the following consonant, so the word is transliterated as "settei".
- Long vowels: there are actually different rules, such as adding an "h" (American rule) or a circumflex accent on the vowel (French rule). However, since people generally write "とうきょう" as "tokyo" and so on, I decided to just get rid of the long vowel.

I know that I am a picky person, but when you work with named entities, the minimum is to keep them as correct as they should be.
The actual code is in PHP, but it can easily be ported to other languages or modified to convert katakana (were you expecting me to do your work? ;)). Enjoy!



<?php 
// Set the character encoding used by the mbstring regex functions (mb_ereg*).
// It should be the same as the encoding this script file is saved in.
mb_regex_encoding("UTF-8");

/**
 * Split a UTF-8 string into a list of single-character tokens.
 *
 * Only the following characters are kept, in input order:
 *  - kanji in the range 一-龠,
 *  - hiragana in the range ぁ-ん,
 *  - katakana in the range ァ-ヴ plus the long-vowel mark ー,
 *  - ASCII letters and digits.
 * Every other character (whitespace, punctuation, ...) is silently dropped.
 *
 * The previous implementation advanced through the string using the return
 * value of mb_ereg() as a byte length. Since PHP 8.0 mb_ereg() returns a
 * boolean, so the string was cut in the middle of a multibyte character and
 * tokenizing stopped after the first token. A single preg_match_all() call
 * avoids that and does not depend on the global mb_regex_encoding() setting.
 *
 * @param string $str UTF-8 encoded text
 * @return array list of one-character strings; empty when nothing matches
 *               or when $str is not valid UTF-8
 */
function charSpliter($str) {
    // The /u modifier makes PCRE treat pattern and subject as UTF-8, so the
    // ranges below are code-point ranges. The original pattern listed
    // [a-zA-Z0-9] twice; the duplicate alternative has been removed.
    $found = preg_match_all('/[一-龠ぁ-んァ-ヴーa-zA-Z0-9]/u', (string) $str, $matches);

    // false => invalid UTF-8 or PCRE error, 0 => no character of interest.
    if ($found === false || $found === 0) {
        return array();
    }

    return $matches[0];
}
/**
 * Transliterate a hiragana string into romaji (Latin characters).
 *
 * Rules applied on top of the plain kana => romaji lookup:
 *  - A small "っ" doubles the first consonant of the following syllable
 *    (せってい => "settei"). It produces nothing at the end of the string or
 *    in front of a vowel / non-hiragana character.
 *  - A small "ゃ/ゅ/ょ" or small vowel fuses with the preceding syllable:
 *    that syllable loses its vowel and the small kana supplies the new one.
 *    The "y" glide is dropped after "sh", "ch" and "j" (しゅ => "shu",
 *    ぎょ => "gyo", りょ => "ryo", ふぁ => "fa").
 *  - Long vowels are dropped: a "う" that follows a chunk ending in "u" or
 *    "o" is skipped (とうきょう => "tokyo"). This is purely positional, so a
 *    genuine "ou"/"uu" sequence is shortened as well.
 *
 * Tokens that are not hiragana (kanji, katakana, Latin characters, ...) are
 * ignored and produce no output.
 *
 * @param string $kana_string UTF-8 text, tokenized with charSpliter()
 * @return string the romaji transliteration ('' when nothing is convertible)
 */
function kana_2_romanji($kana_string) {
    // Full-size hiragana => romaji.
    static $syllables = array(
        'あ' => 'a',  'い' => 'i',   'う' => 'u',   'え' => 'e',  'お' => 'o',
        'か' => 'ka', 'き' => 'ki',  'く' => 'ku',  'け' => 'ke', 'こ' => 'ko',
        'が' => 'ga', 'ぎ' => 'gi',  'ぐ' => 'gu',  'げ' => 'ge', 'ご' => 'go',
        'さ' => 'sa', 'し' => 'shi', 'す' => 'su',  'せ' => 'se', 'そ' => 'so',
        'ざ' => 'za', 'じ' => 'ji',  'ず' => 'zu',  'ぜ' => 'ze', 'ぞ' => 'zo',
        'ま' => 'ma', 'み' => 'mi',  'む' => 'mu',  'め' => 'me', 'も' => 'mo',
        'た' => 'ta', 'ち' => 'chi', 'つ' => 'tsu', 'て' => 'te', 'と' => 'to',
        'だ' => 'da', 'ぢ' => 'di',  'づ' => 'du',  'で' => 'de', 'ど' => 'do',
        'な' => 'na', 'に' => 'ni',  'ぬ' => 'nu',  'ね' => 'ne', 'の' => 'no',
        'は' => 'ha', 'ひ' => 'hi',  'ふ' => 'fu',  'へ' => 'he', 'ほ' => 'ho',
        'ば' => 'ba', 'び' => 'bi',  'ぶ' => 'bu',  'べ' => 'be', 'ぼ' => 'bo',
        'ぱ' => 'pa', 'ぴ' => 'pi',  'ぷ' => 'pu',  'ぺ' => 'pe', 'ぽ' => 'po',
        'ら' => 'ra', 'り' => 'ri',  'る' => 'ru',  'れ' => 're', 'ろ' => 'ro',
        'わ' => 'wa', 'を' => 'wo',
        'や' => 'ya', 'ゆ' => 'yu',  'よ' => 'yo',
        'ん' => 'n',
    );
    // Small kana => the sound they contribute when fused with a syllable.
    static $small_kana = array(
        'ゃ' => 'ya', 'ゅ' => 'yu', 'ょ' => 'yo',
        'ぁ' => 'a',  'ぃ' => 'i',  'ぅ' => 'u', 'ぇ' => 'e', 'ぉ' => 'o',
    );

    $tokens = charSpliter($kana_string);
    $token_count = count($tokens);
    $result = '';
    // Romaji appended for the immediately preceding token when that token was
    // a full-size syllable (and therefore sits at the very end of $result);
    // '' when there is nothing a small kana could fuse with.
    $prev_chunk = '';

    for ($i = 0; $i < $token_count; $i++) {
        $token = $tokens[$i];

        // Romaji of the following full-size syllable, '' if there is none.
        $next_romaji = '';
        if ($i + 1 < $token_count && isset($syllables[$tokens[$i + 1]])) {
            $next_romaji = $syllables[$tokens[$i + 1]];
        }

        $buffer = '';
        $fusable = '';
        if (isset($syllables[$token])) {
            $buffer = $syllables[$token];
            $fusable = $buffer;
        } elseif ($token === 'っ') {
            // Geminate: repeat the first consonant of the next syllable.
            if ($next_romaji !== '' && strpos('aiueo', $next_romaji[0]) === false) {
                $buffer = $next_romaji[0];
            }
        } elseif (isset($small_kana[$token])) {
            $sound = $small_kana[$token];
            if (strlen($prev_chunk) >= 2) {
                // Consonant(s) of the previous syllable, without its vowel.
                $radical = substr($prev_chunk, 0, -1);
                // Remove only the trailing copy of the previous syllable;
                // earlier identical syllables must stay (ききょう => "kikyo").
                $result = (string) substr($result, 0, -strlen($prev_chunk));
                // "sh", "ch" and "j" absorb the "y" glide.
                if ($sound[0] === 'y' && in_array($radical, array('sh', 'ch', 'j'), true)) {
                    $sound = substr($sound, 1);
                }
                $buffer = $radical . $sound;
            } else {
                // Nothing to fuse with (start of string, after a bare vowel,
                // "ん", "っ" or another small kana): emit the sound as is.
                $buffer = $sound;
            }
        } else {
            // Not hiragana: produces no output and breaks fusion.
            $prev_chunk = '';
            continue;
        }

        // Long vowel (とう, しゅう, ...): drop the lengthening "う".
        if ($next_romaji === 'u' && $buffer !== '') {
            $last_vowel = substr($buffer, -1);
            if ($last_vowel === 'u' || $last_vowel === 'o') {
                // here is to add your long vowel rules
                $i++;
                $fusable = '';
            }
        }

        $result .= $buffer;
        $prev_chunk = $fusable;
    }

    return $result;
}
// Demo: dump the romaji produced for a few sample hiragana words.
foreach (array('にほんご', 'せってい', 'しゅうせい', 'ぎょうざ') as $sample_word) {
    var_dump(kana_2_romanji($sample_word));
}
?>