#!/usr/bin/python2.7
# coding: utf-8
# Copyright 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions specific for Japanese language."""
import re
import unicodedata
# Hiragana to romaji.
# This table is copied from: http://code.google.com/p/mozc/source/browse/trunk/src/data/preedit/hiragana-romanji.tsv
HIRAGANA_TO_ROMAJI = [
[u"う゛ぁ", u"VA", u""],
[u"う゛ぃ", u"VI", u""],
[u"う゛", u"VU", u""],
[u"う゛ぇ", u"VE", u""],
[u"う゛ぉ", u"VO", u""],
[u"う゛ゃ", u"VYA", u""],
[u"う゛ゅ", u"VYU", u""],
[u"う゛ょ", u"VYO", u""],
[u"っう゛", u"V", u"ゔ"],
[u"ゔぁ", u"VA", u""],
[u"ゔぃ", u"VI", u""],
[u"ゔ", u"VU", u""],
[u"ゔぇ", u"VE", u""],
[u"ゔぉ", u"VO", u""],
[u"ゔゃ", u"VYA", u""],
[u"ゔゅ", u"VYU", u""],
[u"ゔょ", u"VYO", u""],
[u"っゔ", u"V", u"ゔ"],
[u"っうぁ", u"WWA", u""],
[u"っうぃ", u"WWI", u""],
[u"っう", u"WWU", u""],
[u"っうぇ", u"WWE", u""],
[u"っうぉ", u"WWO", u""],
[u"っぁ", u"XXA", u""],
[u"っぃ", u"XXI", u""],
[u"っぅ", u"XXU", u""],
[u"っぇ", u"XXE", u""],
[u"っぉ", u"XXO", u""],
[u"っか", u"KKA", u""],
[u"っき", u"K", u"き"],
[u"っく", u"KKU", u""],
[u"っけ", u"KKE", u""],
[u"っこ", u"KKO", u""],
[u"っが", u"GGA", u""],
[u"っぎ", u"G", u"ぎ"],
[u"っぐ", u"GGU", u""],
[u"っげ", u"GGE", u""],
[u"っご", u"GGO", u""],
[u"っさ", u"SSA", u""],
[u"っし", u"S", u"し"],
[u"っす", u"SSU", u""],
[u"っせ", u"SSE", u""],
[u"っそ", u"SSO", u""],
[u"っざ", u"ZZA", u""],
[u"っじ", u"Z", u"じ"],
[u"っず", u"ZZU", u""],
[u"っぜ", u"ZZE", u""],
[u"っぞ", u"ZZO", u""],
[u"った", u"TTA", u""],
[u"っち", u"C", u"ち"],
[u"っつ", u"TTU", u""],
[u"って", u"TTE", u""],
[u"っと", u"TTO", u""],
[u"っだ", u"DDA", u""],
[u"っぢ", u"D", u"ぢ"],
[u"っづ", u"DDU", u""],
[u"っで", u"DDE", u""],
[u"っど", u"DDO", u""],
[u"っは", u"HHA", u""],
[u"っひ", u"H", u"ひ"],
[u"っふ", u"HHU", u""],
[u"っへ", u"HHE", u""],
[u"っほ", u"HHO", u""],
[u"っば", u"BBA", u""],
[u"っび", u"B", u"び"],
[u"っぶ", u"BBU", u""],
[u"っべ", u"BBE", u""],
[u"っぼ", u"BBO", u""],
[u"っぱ", u"PPA", u""],
[u"っぴ", u"P", u"ぴ"],
[u"っぷ", u"PPU", u""],
[u"っぺ", u"PPE", u""],
[u"っぽ", u"PPO", u""],
[u"っま", u"MMA", u""],
[u"っみ", u"M", u"み"],
[u"っむ", u"MMU", u""],
[u"っめ", u"MME", u""],
[u"っも", u"MMO", u""],
[u"っや", u"YYA", u""],
[u"っゆ", u"YYU", u""],
[u"っよ", u"YYO", u""],
[u"っゃ", u"XXYA", u""],
[u"っゅ", u"XXYU", u""],
[u"っょ", u"XXYO", u""],
[u"っら", u"RRA", u""],
[u"っり", u"R", u"り"],
[u"っる", u"RRU", u""],
[u"っれ", u"RRE", u""],
[u"っろ", u"RRO", u""],
[u"っゎ", u"XXWA", u""],
[u"っわ", u"WWA", u""],
[u"っゐ", u"WWI", u""],
[u"っゑ", u"WWE", u""],
[u"っを", u"WWO", u""],
[u"いぇ", u"YE", u""],
[u"うぁ", u"WA", u""],
[u"きゃ", u"KYA", u""],
[u"きぃ", u"KYI", u""],
[u"きゅ", u"KYU", u""],
[u"きぇ", u"KYE", u""],
[u"きょ", u"KYO", u""],
[u"ぎゃ", u"GYA", u""],
[u"ぎぃ", u"GYI", u""],
[u"ぎゅ", u"GYU", u""],
[u"ぎぇ", u"GYE", u""],
[u"ぎょ", u"GYO", u""],
[u"くぁ", u"QA", u""],
[u"くぃ", u"QI", u""],
[u"くぇ", u"QE", u""],
[u"くぉ", u"QO", u""],
[u"しゃ", u"SHA", u""],
[u"しぃ", u"SHI", u""],
[u"しゅ", u"SHU", u""],
[u"しぇ", u"SHE", u""],
[u"しょ", u"SHO", u""],
[u"じゃ", u"JA", u""],
[u"じぃ", u"ZYI", u""],
[u"じゅ", u"JU", u""],
[u"じぇ", u"JE", u""],
[u"じょ", u"JO", u""],
[u"ちゃ", u"CHA", u""],
[u"ちゅ", u"CHU", u""],
[u"ちぇ", u"CHE", u""],
[u"ちょ", u"CYO", u""],
[u"ぢゃ", u"DYA", u""],
[u"ぢぃ", u"DYI", u""],
[u"ぢゅ", u"DYU", u""],
[u"ぢぇ", u"DYE", u""],
[u"ぢょ", u"DYO", u""],
[u"つぁ", u"TSA", u""],
[u"つぃ", u"TSI", u""],
[u"つぇ", u"TSE", u""],
[u"つぉ", u"TSO", u""],
[u"てゃ", u"THA", u""],
[u"てぃ", u"THI", u""],
[u"てゅ", u"THU", u""],
[u"てぇ", u"THE", u""],
[u"てょ", u"THO", u""],
[u"でゃ", u"DHA", u""],
[u"でぃ", u"DHI", u""],
[u"でゅ", u"DHU", u""],
[u"でぇ", u"DHE", u""],
[u"でょ", u"DHO", u""],
[u"とぁ", u"TWA", u""],
[u"とぃ", u"TWI", u""],
[u"とぅ", u"TWU", u""],
[u"とぇ", u"TWE", u""],
[u"とぉ", u"TWO", u""],
[u"どぁ", u"DWA", u""],
[u"どぃ", u"DWI", u""],
[u"どぅ", u"DWU", u""],
[u"どぇ", u"DWE", u""],
[u"どぉ", u"DWO", u""],
[u"にゃ", u"NYA", u""],
[u"にぃ", u"NYI", u""],
[u"にゅ", u"NYU", u""],
[u"にぇ", u"NYE", u""],
[u"にょ", u"NYO", u""],
[u"ひゃ", u"HYA", u""],
[u"ひぃ", u"HYI", u""],
[u"ひゅ", u"HYU", u""],
[u"ひぇ", u"HYE", u""],
[u"ひょ", u"HYO", u""],
[u"びゃ", u"BYA", u""],
[u"びぃ", u"BYI", u""],
[u"びゅ", u"BYU", u""],
[u"びぇ", u"BYE", u""],
[u"びょ", u"BYO", u""],
[u"ぴゃ", u"PYA", u""],
[u"ぴぃ", u"PYI", u""],
[u"ぴゅ", u"PYU", u""],
[u"ぴぇ", u"PYE", u""],
[u"ぴょ", u"PYO", u""],
[u"ふゃ", u"FYA", u""],
[u"ふゅ", u"FYU", u""],
[u"ふょ", u"FYO", u""],
[u"みゃ", u"MYA", u""],
[u"みぃ", u"MYI", u""],
[u"みゅ", u"MYU", u""],
[u"みぇ", u"MYE", u""],
[u"みょ", u"MYO", u""],
[u"りゃ", u"RYA", u""],
[u"りぃ", u"RYI", u""],
[u"りゅ", u"RYU", u""],
[u"りぇ", u"RYE", u""],
[u"りょ", u"RYO", u""],
[u"んあ", u"NNA", u""],
[u"んい", u"NNI", u""],
[u"んう", u"NNU", u""],
[u"んえ", u"NNE", u""],
[u"んお", u"NNO", u""],
[u"んな", u"NNNA", u""],
[u"んに", u"NNNI", u""],
[u"んぬ", u"NNNU", u""],
[u"んね", u"NNNE", u""],
[u"んの", u"NNNO", u""],
[u"あ", u"A", u""],
[u"い", u"I", u""],
[u"う", u"U", u""],
[u"え", u"E", u""],
[u"お", u"O", u""],
[u"ぁ", u"XA", u""],
[u"ぃ", u"XI", u""],
[u"ぅ", u"XU", u""],
[u"ぇ", u"XE", u""],
[u"ぉ", u"XO", u""],
[u"か", u"KA", u""],
[u"き", u"KI", u""],
[u"く", u"KU", u""],
[u"け", u"KE", u""],
[u"こ", u"KO", u""],
[u"ヵ", u"XKA", u""],
[u"ヶ", u"XKE", u""],
[u"が", u"GA", u""],
[u"ぎ", u"GI", u""],
[u"ぐ", u"GU", u""],
[u"げ", u"GE", u""],
[u"ご", u"GO", u""],
[u"さ", u"SA", u""],
[u"し", u"SHI", u""],
[u"す", u"SU", u""],
[u"せ", u"SE", u""],
[u"そ", u"SO", u""],
[u"ざ", u"ZA", u""],
[u"じ", u"JI", u""],
[u"ず", u"ZU", u""],
[u"ぜ", u"ZE", u""],
[u"ぞ", u"ZO", u""],
[u"た", u"TA", u""],
[u"ち", u"CHI", u""],
[u"つ", u"TSU", u""],
[u"て", u"TE", u""],
[u"と", u"TO", u""],
[u"だ", u"DA", u""],
[u"ぢ", u"DI", u""],
[u"づ", u"DU", u""],
[u"で", u"DE", u""],
[u"ど", u"DO", u""],
[u"っ", u"XTU", u""],
[u"な", u"NA", u""],
[u"に", u"NI", u""],
[u"ぬ", u"NU", u""],
[u"ね", u"NE", u""],
[u"の", u"NO", u""],
[u"は", u"HA", u""],
[u"ひ", u"HI", u""],
[u"ふ", u"HU", u""],
[u"へ", u"HE", u""],
[u"ほ", u"HO", u""],
[u"ば", u"BA", u""],
[u"び", u"BI", u""],
[u"ぶ", u"BU", u""],
[u"べ", u"BE", u""],
[u"ぼ", u"BO", u""],
[u"ぱ", u"PA", u""],
[u"ぴ", u"PI", u""],
[u"ぷ", u"PU", u""],
[u"ぺ", u"PE", u""],
[u"ぽ", u"PO", u""],
[u"ま", u"MA", u""],
[u"み", u"MI", u""],
[u"む", u"MU", u""],
[u"め", u"ME", u""],
[u"も", u"MO", u""],
[u"ゃ", u"XYA", u""],
[u"や", u"YA", u""],
[u"ゅ", u"XYU", u""],
[u"ゆ", u"YU", u""],
[u"ょ", u"XYO", u""],
[u"よ", u"YO", u""],
[u"ら", u"RA", u""],
[u"り", u"RI", u""],
[u"る", u"RU", u""],
[u"れ", u"RE", u""],
[u"ろ", u"RO", u""],
[u"ゎ", u"XWA", u""],
[u"わ", u"WA", u""],
[u"ゐ", u"WI", u""],
[u"ゑ", u"WE", u""],
[u"を", u"WO", u""],
[u"ん", u"N", u""],
[u"ー", u"-", u""],
[u"〜", u"~", u""],
]
HIRAGANA_TO_ROMAJI_POST_PROCESS = [
[r'AA', u'A'], [r'II', u'I'], [r'UU', u'U'], [r'EE', u'E'],
[r'OO', u'O'], [r'OU', u'O'],
]
# Katakana to hiragana.
KATAKANA_TO_HIRAGANA = {
u'ァ': u'ぁ', u'ィ': u'ぃ', u'ゥ': u'ぅ',
u'ェ': u'ぇ', u'ォ': u'ぉ',
u'ッ': u'っ', u'ャ': u'ゃ',
u'ュ': u'ゅ', u'ョ': u'ょ',
u'ヮ': u'ゎ', u'ヶ': u'ヶ', u'ヵ': u'ヵ',
u'ガ': u'が', u'ギ': u'ぎ',
u'グ': u'ぐ', u'ゲ': u'げ', u'ゴ': u'ご',
u'ザ': u'ざ', u'ジ': u'じ',
u'ズ': u'ず', u'ゼ': u'ぜ', u'ゾ': u'ぞ',
u'ダ': u'だ', u'ヂ': u'ぢ',
u'ヅ': u'づ', u'デ': u'で', u'ド': u'ど',
u'バ': u'ば', u'ビ': u'び', u'ブ': u'ぶ',
u'ベ': u'べ', u'ボ': u'ぼ',
u'パ': u'ぱ', u'ピ': u'ぴ', u'プ': u'ぷ',
u'ペ': u'ぺ', u'ポ': u'ぽ',
u'ヴ': u'ゔ', u'ヰ': u'ゐ', u'ヱ': u'ゑ',
u'ア': u'あ', u'イ': u'い', u'ウ': u'う',
u'エ': u'え', u'オ': u'お',
u'カ': u'か', u'キ': u'き', u'ク': u'く',
u'ケ': u'け', u'コ': u'こ',
u'サ': u'さ', u'シ': u'し', u'ス': u'す',
u'セ': u'せ', u'ソ': u'そ',
u'タ': u'た', u'チ': u'ち', u'ツ': u'つ',
u'テ': u'て', u'ト': u'と',
u'ナ': u'な', u'ニ': u'に', u'ヌ': u'ぬ',
u'ネ': u'ね', u'ノ': u'の',
u'ハ': u'は', u'ヒ': u'ひ', u'フ': u'ふ',
u'ヘ': u'へ', u'ホ': u'ほ',
u'マ': u'ま', u'ミ': u'み', u'ム': u'む',
u'メ': u'め', u'モ': u'も',
u'ヤ': u'や', u'ユ': u'ゆ', u'ヨ': u'よ',
u'ラ': u'ら', u'リ': u'り', u'ル': u'る',
u'レ': u'れ', u'ロ': u'ろ',
u'ワ': u'わ', u'ヲ': u'を', u'ン': u'ん',
u'ー': u'ー',
}
HIRAGANA_NORMALIZATION = {
u'ぢ': u'じ', u'づ': u'ず', u'ゐ': u'い', u'ゑ': u'え',
}
# Dictionary of characters ([\u3000-\u9fff]) that are popularly used as part
# of Japanese names with their relative frequency counts. This dictionary is
# generated by aggregating names_prefixes of about 520k Person entries from the
# Japan instance as of 03-25-2011. We retained only the ones with frequnecy
# counts >= 500.
NAME_CHAR_POPULARITY_MAP = {
u'子': 125824, u'藤': 58115, u'田': 45545, u'佐': 43413, u'木': 39214,
u'美': 37144, u'野': 32917, u'山': 26263, u'小': 23586, u'一': 21586,
u'川': 20002, u'部': 19426, u'村': 18580, u'大': 16649, u'高': 16240,
u'阿': 15842, u'々': 15668, u'橋': 14894, u'千': 14265, u'和': 13300,
u'鈴': 13253, u'三': 12638, u'本': 12600, u'原': 12542, u'恵': 11862,
u'幸': 10975, u'谷': 10926, u'井': 10918, u'松': 10633, u'正': 10329,
u'雄': 10279, u'み': 10069, u'中': 9645, u'菅': 9505, u'夫': 9464,
u'さ': 9359, u'渡': 9075, u'久': 9055, u'吉': 8925, u'郎': 8882,
u'上': 8874, u'男': 8641, u'浦': 8208, u'あ': 8036, u'葉': 7945,
u'伊': 7756, u'真': 7627, u'と': 7616, u'太': 7528, u'か': 7479,
u'石': 7477, u'澤': 7365, u'ま': 7270, u'由': 7266, u'た': 7233,
u'平': 7099, u'内': 7009, u'き': 6973, u'明': 6826, u'志': 6776,
u'菊': 6750, u'勝': 6731, u'寺': 6682, u'崎': 6672, u'代': 6435,
u'遠': 6407, u'江': 6298, u'ひ': 6085, u'辺': 6046, u'智': 5991,
u'香': 5929, u'也': 5844, u'光': 5757, u'清': 5744, u'紀': 5629,
u'ゆ': 5517, u'奈': 5263, u'弘': 5239, u'熊': 5221, u'武': 5218,
u'治': 5175, u'裕': 5113, u'孝': 5061, u'沢': 5045, u'喜': 5018,
u'洋': 5007, u'二': 4899, u'信': 4875, u'加': 4823, u'里': 4747,
u'し': 4718, u'及': 4678, u'秀': 4655, u'よ': 4639, u'林': 4565,
u'彦': 4550, u'文': 4516, u'義': 4503, u'樹': 4446, u'島': 4412,
u'良': 4369, u'賀': 4342, u'岩': 4306, u'岡': 4245, u'森': 4218,
u'昭': 4150, u'利': 4146, u'金': 4142, u'安': 4108, u'人': 4089,
u'枝': 4067, u'坂': 4063, u'之': 4059, u'西': 3980, u'友': 3971,
u'貴': 3948, u'池': 3925, u'沼': 3894, u'英': 3862, u'博': 3821,
u'司': 3821, u'浩': 3789, u'保': 3765, u'今': 3753, u'新': 3695,
u'地': 3660, u'は': 3636, u'佳': 3618, u'ち': 3600, u'海': 3542,
u'芳': 3528, u'健': 3498, u'ふ': 3490, u'津': 3458, u'長': 3453,
u'栄': 3448, u'俊': 3418, u'隆': 3417, u'斉': 3413, u'こ': 3406,
u'敏': 3371, u'口': 3351, u'理': 3276, u'優': 3263, u'間': 3248,
u'す': 3169, u'永': 3168, u'下': 3153, u'直': 3094, u'広': 3048,
u'宏': 2995, u'や': 2987, u'春': 2953, u'愛': 2919, u'お': 2891,
u'塚': 2887, u'富': 2879, u'畠': 2805, u'な': 2804, u'行': 2790,
u'祐': 2789, u'い': 2787, u'尾': 2761, u'戸': 2753, u'宮': 2699,
u'希': 2656, u'横': 2554, u'康': 2534, u'り': 2516, u'生': 2515,
u'八': 2496, u'仁': 2495, u'の': 2469, u'輝': 2458, u'政': 2452,
u'成': 2444, u'徳': 2436, u'後': 2398, u'水': 2390, u'雅': 2386,
u'須': 2373, u'忠': 2370, u'悦': 2364, u'重': 2359, u'花': 2357,
u'茂': 2352, u'福': 2315, u'陽': 2278, u'相': 2273, u'根': 2272,
u'寿': 2263, u'倉': 2254, u'斎': 2244, u'馬': 2228, u'京': 2222,
u'日': 2213, u'知': 2173, u'菜': 2145, u'介': 2129, u'次': 2119,
u'泉': 2080, u'つ': 2076, u'く': 2074, u'東': 2052, u'星': 2040,
u'実': 2023, u'け': 2007, u'黒': 1977, u'せ': 1969, u'豊': 1962,
u'則': 1939, u'穂': 1913, u'齋': 1903, u'史': 1884, u'勇': 1873,
u'門': 1842, u'昌': 1839, u'荒': 1831, u'誠': 1818, u'て': 1800,
u'麻': 1788, u'形': 1784, u'純': 1780, u'伸': 1748, u'道': 1746,
u'堀': 1743, u'典': 1741, u'ゐ': 1732, u'杉': 1732, u'城': 1728,
u'竹': 1727, u'古': 1724, u'青': 1691, u'静': 1655, u'哲': 1648,
u'克': 1647, u'泰': 1633, u'淳': 1629, u'早': 1628, u'関': 1611,
u'前': 1601, u'畑': 1581, u'士': 1575, u'哉': 1573, u'白': 1565,
u'亜': 1560, u'秋': 1559, u'拓': 1531, u'晴': 1525, u'幡': 1519,
u'修': 1516, u'善': 1513, u'翔': 1498, u'節': 1489, u'弥': 1455,
u'貞': 1424, u'順': 1418, u'見': 1416, u'場': 1415, u'沙': 1409,
u'啓': 1403, u'市': 1402, u'舘': 1401, u'好': 1392, u'浅': 1387,
u'瀬': 1386, u'達': 1378, u'猪': 1359, u'敬': 1354, u'有': 1353,
u'元': 1353, u'照': 1329, u'公': 1323, u'多': 1310, u'合': 1309,
u'庄': 1294, u'吾': 1290, u'え': 1282, u'工': 1281, u'矢': 1278,
u'賢': 1271, u'笠': 1268, u'登': 1264, u'赤': 1260, u'土': 1255,
u'織': 1247, u'五': 1245, u'嶋': 1244, u'斗': 1244, u'晃': 1221,
u'う': 1213, u'桜': 1212, u'悠': 1202, u'狩': 1198, u'末': 1194,
u'章': 1193, u'彩': 1191, u'綾': 1188, u'尚': 1188, u'近': 1181,
u'丹': 1179, u'亀': 1173, u'亮': 1164, u'絵': 1149, u'寛': 1139,
u'北': 1136, u'衣': 1133, u'邦': 1127, u'憲': 1127, u'米': 1118,
u'輔': 1115, u'郁': 1108, u'玉': 1104, u'奥': 1095, u'国': 1094,
u'妻': 1084, u'満': 1082, u'守': 1082, u'夏': 1082, u'羽': 1068,
u'ゑ': 1060, u'百': 1046, u'鎌': 1045, u'河': 1021, u'玲': 1017,
u'朗': 1007, u'梨': 997, u'乃': 994, u'わ': 966, u'家': 962, u'條': 960,
u'華': 960, u'聖': 956, u'慶': 953, u'柴': 950, u'越': 945, u'咲': 944,
u'も': 939, u'龍': 937, u'れ': 933, u'梅': 928, u'十': 922, u'未': 920,
u'柳': 920, u'圭': 913, u'邊': 904, u'進': 902, u'嘉': 890, u'澄': 887,
u'佑': 887, u'浜': 887, u'屋': 885, u'恭': 884, u'功': 876, u'敦': 875,
u'剛': 874, u'若': 868, u'礼': 857, u'将': 833, u'南': 829, u'作': 828,
u'蔵': 819, u'紗': 817, u'月': 814, u'髙': 800, u'細': 799, u'音': 796,
u'名': 796, u'神': 791, u'出': 787, u'又': 785, u'結': 784, u'祥': 784,
u'四': 783, u'歩': 781, u'梶': 775, u'莉': 775, u'め': 769, u'渕': 768,
u'聡': 764, u'宗': 763, u'央': 761, u'片': 757, u'昆': 757, u'稲': 752,
u'る': 751, u'廣': 751, u'ほ': 750, u'鶴': 744, u'目': 738, u'紺': 737,
u'来': 731, u'丸': 726, u'七': 726, u'那': 721, u'じ': 714, u'櫻': 712,
u'繁': 708, u'瑞': 705, u'世': 703, u'飯': 700, u'酒': 699, u'半': 698,
u'植': 685, u'朋': 682, u'助': 681, u'初': 676, u'睦': 673, u'眞': 671,
u'竜': 663, u'悟': 663, u'靖': 658, u'盛': 655, u'卓': 655, u'定': 650,
u'板': 649, u'都': 645, u'宇': 643, u'慎': 642, u'宍': 638, u'学': 636,
u'力': 635, u'柏': 631, u'立': 628, u'草': 626, u'鳥': 624, u'嵐': 621,
u'充': 617, u'昇': 615, u'徹': 608, u'朝': 607, u'栗': 605, u'邉': 599,
u'緒': 596, u'律': 595, u'妙': 593, u'辰': 593, u'濱': 592, u'む': 585,
u'岸': 584, u'天': 580, u'旧': 580, u'君': 577, u'涼': 572, u'稔': 567,
u'雪': 564, u'彰': 563, u'心': 557, u'槻': 556, u'己': 555, u'勉': 544,
u'冨': 542, u'居': 541, u'房': 539, u'姓': 539, u'恒': 538, u'征': 538,
u'桂': 535, u'に': 534, u'薫': 531, u'望': 529, u'垣': 526, u'厚': 524,
u'笹': 523, u'巳': 522, u'渋': 518, u'瓶': 506, u'洞': 505, u'湊': 505,
u'瑠': 505, u'波': 505, u'幹': 504, u'角': 501, u'弓': 500,
}
assert len(NAME_CHAR_POPULARITY_MAP) == 474
[docs]def should_normalize(string):
"""Checks if the string should be normalized by jautils.normalize() as
opposed to text_query.normalize().
Args:
string: a unicode string to check.
Returns:
True if the string should be normalized by jautils.normalize().
"""
# Does the string contains any of the following characters?
# - hiragana
# - full/half width katakana
# - full width alphabets
return re.search(ur'[\u3040-\u30ff\uff00-\uff9f]', string) != None
[docs]def normalize(string):
"""Normalizes the string with a Japanese specific logic.
Args:
string: a unicode string to normalize.
Returns:
a unicode string obtained by normalizing the input string.
"""
# NFKC normalization does the followings:
# - Full width roman letter to ascii
# - Whitespace characters to " "
# - Half width katakana to full width
letters = []
for ch in unicodedata.normalize('NFKC', string):
# Remove non-letter characters.
category = unicodedata.category(ch)
if category.startswith('L'):
letters.append(ch)
elif category != 'Mn' and ch != "'": # Treat O'Hearn as OHEARN
letters.append(' ')
normalized = ''.join(letters).strip().upper()
normalized = katakana_to_hiragana(normalized)
normalized = normalize_hiragana(normalized)
return normalized
[docs]def is_hiragana(string):
"""Returns True if the argument is a non-empty string of only
hiragana characters."""
return re.match(ur'^[\u3040-\u309f]+$', string) != None
[docs]def normalize_hiragana(string):
"""Normalizes hiragana characters to absorb confusing spelling variations.
Args:
string: a unicode string, possibly containing hiragana characters.
Returns:
The normalized string.
"""
return u''.join([HIRAGANA_NORMALIZATION.get(ch, ch) for ch in string])
[docs]def katakana_to_hiragana(string):
"""Replaces each occurrence of katakana in a unicode string with a hiragana.
Args:
string: a unicode string, possibly containing katakana characters.
Returns:
The replaced string.
"""
replaced = u''
for ch in string:
replaced += KATAKANA_TO_HIRAGANA.get(ch, ch)
return replaced
[docs]def hiragana_to_romaji(string):
"""Replaces each occurrence of hiragana in a unicode string with a romaji.
Args:
string: a unicode string, possibly containing hiragana characters.
Returns:
The replaced string.
"""
remaining = string
result = u''
while remaining:
longest = 0
longest_data = None
for (hira, rom, next) in HIRAGANA_TO_ROMAJI:
if remaining.startswith(hira) and len(hira) > longest:
longest_data = (hira, rom, next)
longest = len(hira)
if longest == 0:
# erroneous info
result += remaining[0]
remaining = remaining[1:]
else:
result += longest_data[1]
remaining = longest_data[2] + remaining[len(longest_data[0]):]
for (pat, rep) in HIRAGANA_TO_ROMAJI_POST_PROCESS:
result = re.sub(pat, rep, result)
return result
[docs]def get_additional_tokens(tokens):
"""Generates new tokens by combining tokens and converting them to various
character representations, which can be used as search index tokens.
Args:
tokens: a list or set of unicode strings to expand from.
Returns:
A set of newly generated tokens to add to the search index.
"""
expanded_tokens = set()
all_hiragana = True
for token in tokens:
if is_hiragana(token):
# Adds romaji variation of the token so that people without an IME
# can still search for Japanese names.
expanded_tokens.add(hiragana_to_romaji(token))
else:
all_hiragana = False
# Japanese users often search by hiragana's where a family name and a given
# name is concatenated without a space in between. Because a sequence of
# hiragana's is not segmented at query time, we need to add those
# concatenated tokens to the index to make them searchable.
# len(tokens) == 2 should almost always hold when used against Japanese
# alternate names (one hiragana token for given name and another hiragana
# token for family name.)
if all_hiragana and len(tokens) == 2:
token_list = list(tokens)
expanded_tokens.add(token_list[0] + token_list[1])
expanded_tokens.add(token_list[1] + token_list[0])
return expanded_tokens
[docs]def sorted_by_popularity(tokens):
"""Sort tokens according to popularity (see NAME_CHAR_POPULARITY_MAP) so
that tokens that are LESS popular in Japanese names come first, and return
the sorted tokens.
Args:
tokens: tokens to sort.
Returns:
Sorted tokens.
"""
return sorted(tokens, key=lambda t: NAME_CHAR_POPULARITY_MAP.get(t, 0))