- The small "っ" (sokuon), as in "せってい" (setting), which is transliterated as "settei" (the following consonant is doubled).
- Long vowels: there are actually different rules, such as adding an "h" (American rule) or putting a circumflex accent on the vowel (French rule). However, since people generally write "とうきょう" as "tokyo" and so on, I decided to simply get rid of the long vowel.
I know that I am a picky person, but when you work with named entities, the minimum is to keep them as accurate as they should be.
The actual code is in PHP but can easily be ported to other languages or modified to convert katakana (were you expecting me to do your work? ;)). Enjoy!
<?php
// Must be the same as the encoding this script file is saved in: the
// character ranges used by the mb_ereg() pattern below are written as
// literal UTF-8 characters.
mb_regex_encoding("UTF-8");
/**
 * Splits a UTF-8 string into its individual kanji, hiragana, katakana and
 * ASCII alphanumeric characters.
 *
 * Any character outside those classes (spaces, punctuation, ...) is skipped,
 * so the returned list may be shorter than the input.
 *
 * @param string $str UTF-8 encoded text
 * @return array list of single-character strings, in order of appearance;
 *               empty when nothing matches or the input is not valid UTF-8
 */
function charSpliter($str) {
    // The previous implementation advanced through the string using the
    // return value of mb_ereg() as a byte length. Since PHP 8.0 mb_ereg()
    // returns a plain boolean, so the cursor moved by a single byte and
    // landed in the middle of a multi-byte character. Collecting every match
    // in one preg_match_all() call has no such version-dependent behaviour.
    //
    // NOTE(review): the original pattern listed [a-zA-Z0-9] twice; the second
    // copy was presumably meant to cover full-width alphanumerics - confirm
    // before extending the pattern.
    $found = preg_match_all('/[一-龠]|[ぁ-ん]|[ァ-ヴー]|[a-zA-Z0-9]/u', (string) $str, $matches);

    // false: the subject is not valid UTF-8; 0: no supported character at all.
    if ($found === false || $found === 0) {
        return array();
    }

    return $matches[0];
}
/**
 * Transliterates a hiragana string into romaji.
 *
 * Rules applied:
 *  - plain kana are mapped through a fixed table (shi, chi, tsu, fu, ji ...);
 *  - a small "っ" doubles the first letter of the following syllable
 *    ("せってい" => "settei");
 *  - a small "ゃ/ゅ/ょ" or small vowel merges with the preceding syllable
 *    ("ぎょ" => "gyo", "ひょ" => "hyo", "しゅ" => "shu", "ふぁ" => "fa");
 *  - a "う" that follows a syllable ending in "u" or "o" is treated as a
 *    long-vowel marker and is deliberately dropped ("とうきょう" => "tokyo").
 *
 * Tokens that are not hiragana (kanji, katakana, latin letters, digits) are
 * ignored and produce no output.
 *
 * @param string $kana_string UTF-8 encoded text
 * @return string the romaji transliteration (empty string if nothing converts)
 */
function kana_2_romanji($kana_string) {
    // Plain hiragana => romaji.
    static $syllables = array(
        'あ' => 'a',  'い' => 'i',   'う' => 'u',   'え' => 'e',  'お' => 'o',
        'か' => 'ka', 'き' => 'ki',  'く' => 'ku',  'け' => 'ke', 'こ' => 'ko',
        'が' => 'ga', 'ぎ' => 'gi',  'ぐ' => 'gu',  'げ' => 'ge', 'ご' => 'go',
        'さ' => 'sa', 'し' => 'shi', 'す' => 'su',  'せ' => 'se', 'そ' => 'so',
        'ざ' => 'za', 'じ' => 'ji',  'ず' => 'zu',  'ぜ' => 'ze', 'ぞ' => 'zo',
        'た' => 'ta', 'ち' => 'chi', 'つ' => 'tsu', 'て' => 'te', 'と' => 'to',
        'だ' => 'da', 'ぢ' => 'di',  'づ' => 'du',  'で' => 'de', 'ど' => 'do',
        'な' => 'na', 'に' => 'ni',  'ぬ' => 'nu',  'ね' => 'ne', 'の' => 'no',
        'は' => 'ha', 'ひ' => 'hi',  'ふ' => 'fu',  'へ' => 'he', 'ほ' => 'ho',
        'ば' => 'ba', 'び' => 'bi',  'ぶ' => 'bu',  'べ' => 'be', 'ぼ' => 'bo',
        'ぱ' => 'pa', 'ぴ' => 'pi',  'ぷ' => 'pu',  'ぺ' => 'pe', 'ぽ' => 'po',
        'ま' => 'ma', 'み' => 'mi',  'む' => 'mu',  'め' => 'me', 'も' => 'mo',
        'や' => 'ya', 'ゆ' => 'yu',  'よ' => 'yo',
        'ら' => 'ra', 'り' => 'ri',  'る' => 'ru',  'れ' => 're', 'ろ' => 'ro',
        'わ' => 'wa', 'を' => 'wo',  'ん' => 'n',
    );
    // Small kana that modify the syllable written just before them.
    static $small_kana = array(
        'ゃ' => 'ya', 'ゅ' => 'yu', 'ょ' => 'yo',
        'ぁ' => 'a',  'ぃ' => 'i',  'ぅ' => 'u', 'ぇ' => 'e', 'ぉ' => 'o',
    );
    // Stems that absorb the "y" glide: sh+ya => sha, ch+yu => chu, j+yo => jo.
    // Every other stem keeps it: k+yo => kyo, h+yo => hyo, n+yu => nyu.
    static $glide_free_stems = array('sh' => true, 'ch' => true, 'j' => true);

    $tokens = charSpliter($kana_string);
    $token_count = count($tokens);
    $result = '';
    // Romaji of the plain syllable appended by the previous iteration, or ''
    // when the previous token was anything else. A small kana merges with it.
    $previous = '';

    for ($i = 0; $i < $token_count; $i++) {
        $token = $tokens[$i];
        // Bounds-checked look-ahead; '' means "there is no next token".
        $next = ($i + 1 < $token_count) ? $tokens[$i + 1] : '';
        $base = $previous;
        $previous = '';

        if (isset($syllables[$token])) {
            $buffer = $syllables[$token];
            $previous = $buffer;
        } elseif ($token === 'っ') {
            // Sokuon: double the first letter of the following syllable.
            $buffer = isset($syllables[$next]) ? substr($syllables[$next], 0, 1) : '';
        } elseif (isset($small_kana[$token])) {
            $suffix = $small_kana[$token];
            if (strlen($base) >= 2) {
                // Replace only the syllable written immediately before, not
                // every earlier occurrence of the same romaji in the result.
                $result = (string) substr($result, 0, -strlen($base));
                $stem = substr($base, 0, -1);
                if (isset($glide_free_stems[$stem])) {
                    $suffix = substr($suffix, -1);
                }
                $buffer = $stem . $suffix;
            } else {
                // No consonant syllable to merge with: emit the small kana as is.
                $buffer = $suffix;
            }
        } else {
            // Not hiragana: nothing to transliterate.
            continue;
        }

        // Long vowel (とう, しゅう ...): drop the lengthening "う".
        // This is the place to plug in another long-vowel convention.
        if ($next === 'う' && $buffer !== '') {
            $last_letter = substr($buffer, -1);
            if ($last_letter === 'u' || $last_letter === 'o') {
                $i++;
                $previous = '';
            }
        }

        $result .= $buffer;
    }

    return $result;
}
// Demo calls; the intended transliteration is noted next to each one.
var_dump(kana_2_romanji('にほんご'));   // intended: "nihongo"
var_dump(kana_2_romanji('せってい'));   // intended: "settei" (doubled consonant from the small っ)
var_dump(kana_2_romanji('しゅうせい')); // intended: "shusei" (the long-vowel う is dropped)
var_dump(kana_2_romanji('ぎょうざ'));   // intended: "gyoza"
?>
3 comments:
il y a un bug : shuusei, pas shusei
désolé, tu avais mis qqchose pour les voyelles longues dans ton code et ton article
par contre :
hyouban
gyuunyu ne passent pas
tu n'as pris que gyo/gyu et kyo/kyu
alors qu'il y a tous les autres
-> ajouter plus de "ou", ligne 40
sinon merci pour ton code, c'est ce que je cherchais !
et super concis en plus de ça, jamais vu un tel travail
omedetou
Salut, oui, je me doute que la fonction n'est pas parfaite, et merci pour vos commentaires : cela aidera les personnes qui rencontrent ces problèmes. Si j'ai le temps, je mettrai à jour la fonction.
Post a Comment