- The small "っ" (sokuon), as in "せってい" (setting), which is transliterated as "settei" by doubling the following consonant.
- Long vowels: there are actually different rules, such as adding an "h" (American rule) or a circumflex accent on the vowel (French rule). However, since people generally write "とうきょう" as "tokyo" and so on, I decided to just get rid of the long vowel.
I know that I am a picky person, but when you work with named entities, the bare minimum is to keep them as accurate as they should be.
The actual code is in PHP, but it can easily be ported to other languages or modified to convert katakana (were you expecting me to do your work? ;)). Enjoy!
<?php
// The multibyte regex engine must use the same encoding as this script file.
mb_regex_encoding('UTF-8');
/**
 * Splits a string into its individual kanji, hiragana, katakana and ASCII
 * alphanumeric characters, in order of appearance.
 *
 * Any character that matches none of those classes (spaces, punctuation, ...)
 * is silently skipped. The pattern is interpreted in the current
 * mb_regex_encoding(), which must match the encoding of $str.
 *
 * @param string $str Text to tokenise; null or an empty string yields array().
 * @return array List of single-character strings.
 */
function charSpliter($str) {
    // NOTE(review): the original pattern listed [a-zA-Z0-9] twice; the second
    // copy was presumably meant to be the full-width alphanumerics - confirm.
    $pattern = "[一-龠]|[ぁ-ん]|[ァ-ヴー]|[a-zA-Z0-9]";
    $tokens = array();
    $rest = (string) $str;

    // mb_ereg() returns false when nothing matches. On a match it returns an
    // int (PHP < 8) or true (PHP >= 8), so only the comparison with false is
    // portable; the matched text itself is read from $match[0].
    while ($rest !== '' && mb_ereg($pattern, $rest, $match) !== false) {
        $char = $match[0];
        $tokens[] = $char;
        // Advance by the real byte length of the match. The return value of
        // mb_ereg() cannot be used for this: on PHP 8 it is a boolean, which
        // would move the offset by one byte and split a multibyte character.
        // The cast covers substr() returning false at end of string on PHP < 8.
        $rest = (string) substr($rest, strpos($rest, $char) + strlen($char));
    }

    return $tokens;
}
/**
 * Transliterates hiragana into romaji, dropping long-vowel marks.
 *
 * The input is tokenised with charSpliter(). Tokens that are not hiragana
 * listed in the tables below (kanji, katakana, ASCII, ...) are skipped and
 * produce no output.
 *
 * Rules applied:
 *  - a small "っ" doubles the first letter of the following kana
 *    (せってい => settei); at the end of the input it produces nothing;
 *  - a small ゃ/ゅ/ょ or ぁ/ぃ/ぅ/ぇ/ぉ merges with the preceding
 *    consonant+vowel kana (きょ => kyo, にゅ => nyu, ふぁ => fa); after
 *    し/ち/じ the "y" glide is dropped (しゅ => shu, ちょ => cho, じゃ => ja);
 *  - a small kana with nothing to merge with reads like its full-size
 *    counterpart (ゃ => ya);
 *  - a "う" that follows a syllable ending in "u" or "o" is treated as vowel
 *    lengthening and dropped (とうきょう => tokyo).
 *
 * @param string $kana_string Text containing hiragana.
 * @return string The romaji transliteration.
 */
function kana_2_romanji($kana_string) {
    // Full-size hiragana => romaji.
    static $kana_map = array(
        'あ' => 'a',  'い' => 'i',   'う' => 'u',   'え' => 'e',  'お' => 'o',
        'か' => 'ka', 'き' => 'ki',  'く' => 'ku',  'け' => 'ke', 'こ' => 'ko',
        'が' => 'ga', 'ぎ' => 'gi',  'ぐ' => 'gu',  'げ' => 'ge', 'ご' => 'go',
        'さ' => 'sa', 'し' => 'shi', 'す' => 'su',  'せ' => 'se', 'そ' => 'so',
        'ざ' => 'za', 'じ' => 'ji',  'ず' => 'zu',  'ぜ' => 'ze', 'ぞ' => 'zo',
        'た' => 'ta', 'ち' => 'chi', 'つ' => 'tsu', 'て' => 'te', 'と' => 'to',
        'だ' => 'da', 'ぢ' => 'di',  'づ' => 'du',  'で' => 'de', 'ど' => 'do',
        'な' => 'na', 'に' => 'ni',  'ぬ' => 'nu',  'ね' => 'ne', 'の' => 'no',
        'は' => 'ha', 'ひ' => 'hi',  'ふ' => 'fu',  'へ' => 'he', 'ほ' => 'ho',
        'ば' => 'ba', 'び' => 'bi',  'ぶ' => 'bu',  'べ' => 'be', 'ぼ' => 'bo',
        'ぱ' => 'pa', 'ぴ' => 'pi',  'ぷ' => 'pu',  'ぺ' => 'pe', 'ぽ' => 'po',
        'ま' => 'ma', 'み' => 'mi',  'む' => 'mu',  'め' => 'me', 'も' => 'mo',
        'や' => 'ya', 'ゆ' => 'yu',  'よ' => 'yo',
        'ら' => 'ra', 'り' => 'ri',  'る' => 'ru',  'れ' => 're', 'ろ' => 'ro',
        'わ' => 'wa', 'を' => 'wo',  'ん' => 'n',
    );
    // Small kana => reading appended to the consonant radical of the kana before it.
    static $small_map = array(
        'ゃ' => 'ya', 'ゅ' => 'yu', 'ょ' => 'yo',
        'ぁ' => 'a',  'ぃ' => 'i',  'ぅ' => 'u', 'ぇ' => 'e', 'ぉ' => 'o',
    );
    // Radicals that already contain the palatal sound, so the "y" is not written.
    static $glide_free_radicals = array('sh', 'ch', 'j');

    $tokens = charSpliter($kana_string);
    $count = count($tokens);
    $result = '';

    for ($i = 0; $i < $count; $i++) {
        $token = $tokens[$i];
        // Bounds-checked look-ahead: null when $token is the last one.
        $next = ($i + 1 < $count) ? $tokens[$i + 1] : null;

        if ($token === 'っ') {
            // Sokuon: double the first letter of the next kana, if there is one.
            if ($next !== null && isset($kana_map[$next])) {
                $result .= substr($kana_map[$next], 0, 1);
            }
            continue;
        }

        if (isset($kana_map[$token])) {
            $syllable = $kana_map[$token];
            // Merge with a following small kana. Only consonant+vowel kana can
            // merge; bare vowels and "n" have no radical to attach to.
            if ($next !== null && isset($small_map[$next]) && strlen($syllable) > 1) {
                $radical = substr($syllable, 0, -1);
                $suffix = $small_map[$next];
                if ($suffix[0] === 'y' && in_array($radical, $glide_free_radicals, true)) {
                    $suffix = substr($suffix, 1);
                }
                $syllable = $radical . $suffix;
                $i++; // the small kana has been consumed
            }
        } elseif (isset($small_map[$token])) {
            // Small kana with nothing to merge with.
            $syllable = $small_map[$token];
        } else {
            // Not a known hiragana: produces no output.
            continue;
        }

        // In case of a long vowel (とう, しゅう, etc.): drop the lengthening "う".
        if ($i + 1 < $count && $tokens[$i + 1] === 'う') {
            $last_vowel = substr($syllable, -1);
            if ($last_vowel === 'u' || $last_vowel === 'o') {
                // here is the place to add your own long vowel rules
                $i++;
            }
        }

        $result .= $syllable;
    }

    return $result;
}
// Demo: dump the transliteration of a few sample words.
foreach (array('にほんご', 'せってい', 'しゅうせい', 'ぎょうざ') as $sample_word) {
    var_dump(kana_2_romanji($sample_word));
}
?>