# Prerequisite (run in a shell, not in Python): pip install janome
def read_file(file_path):
    """Read a text file, trying several Japanese encodings in turn.

    The candidates are tried in order and the first one that decodes the
    whole file without error wins.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded contents of the file as ``str``.

    Raises:
        UnicodeError: If none of the candidate encodings can decode the file.
            The last ``UnicodeDecodeError`` is attached as ``__cause__``.
        OSError: If the file cannot be opened.
    """
    # NOTE(review): ISO-2022-JP is a 7-bit encoding, so such files also decode
    # without error as UTF-8 (tried first) -- confirm the order is intended.
    encodings = ['utf-8', 'Shift_JIS', 'euc_jp', 'iso2022_jp']
    last_error = None
    for enc in encodings:
        try:
            with open(file_path, 'r', encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError as err:
            # Wrong guess: remember why and move on to the next candidate.
            last_error = err
    # No encoding could read the file. A bare ``raise`` here would fail with
    # "No active exception to re-raise", so raise an explicit error instead.
    raise UnicodeError(
        f"could not decode {file_path!r} with any of: {', '.join(encodings)}"
    ) from last_error
from janome.tokenizer import Tokenizer
import os
import re
import sys
def hurigana(text_load, easy_reading=set()):
    """Add ruby markup to kanji words the first time their kanji appear.

    The text is processed line by line with the janome ``Tokenizer``. A token
    that contains a kanji not seen before is written as ``|surface《reading》``,
    using the reading exactly as the tokenizer reports it (the original
    author notes that this comes out as katakana). Other tokens are copied
    unchanged.

    Args:
        text_load: The text to annotate; lines are separated by ``"\\n"``.
        easy_reading: Characters that never need ruby. If the elementary
            school kanji list file exists in the working directory, its
            characters are added to this set.

    Returns:
        The annotated text. Every input line is followed by ``"\\n"``.
    """

    def is_kanji(ch):
        # Same code-point test as the original: the kanji range plus 々 (12293).
        code = ord(ch)
        return code == 12293 or 13312 < code < 64217

    lines = text_load.split("\n")
    if os.path.exists(grade_school_list := r"小学生で習う漢字一覧のテキストファイル.txt"):
        easy_reading = easy_reading.union(set(read_file(grade_school_list)))
    seen = easy_reading
    tokenizer = Tokenizer()
    chunks = []
    for line in lines:
        # A line containing "[" is copied verbatim and resets the set of
        # already-seen characters back to the baseline.
        if "[" in line:
            chunks.append(line + "\n")
            seen = easy_reading
            continue
        pairs = [
            (token.surface, token.reading)
            for token in tokenizer.tokenize(line)
            if len(token.surface) > 0
        ]
        inside_ruby = False
        for surface, reading in pairs:
            # Text that already carries ruby ("|" ... "》") is passed through
            # untouched; its characters still count as seen.
            if inside_ruby or "|" in surface:
                chunks.append(surface)
                seen = seen.union(set(surface))
                inside_ruby = "》" not in surface
                continue
            has_new_kanji = False
            for ch in surface:
                if is_kanji(ch) and ch not in seen:
                    has_new_kanji = True
            if has_new_kanji:
                # Provisional approach: annotate the whole token at once.
                chunks.append(rf"|{surface}《{reading}》")
                seen = seen.union(set(surface))
            else:
                chunks.append(surface)
        chunks.append("\n")
    return "".join(chunks)
if __name__ == "__main__":
    # Demo: annotate a one-line sample with hurigana() and print the result.
    # Alternative that annotates a whole file instead:
    # hurigana(read_file(r"封印魔道士は魔道士になる夢を見る.txt"))
    print(hurigana(text_load="封印魔道士は魔道士になる夢を見る"))
# Excerpt of the duplicate-kanji check from hurigana()'s token loop. This
# variant tests membership in easy_reading instead of Already_read.
# NOTE(review): new_text, easy_reading and new_kanji are not defined at module
# level in the visible code -- presumably these lines are meant to replace the
# corresponding lines inside hurigana(); confirm before running this file.
# Skip kanji that are already known
for new in new_text:
    if (13312 < ord(new) < 64217 or ord(new) == 12293) and not new in easy_reading:
        new_kanji = True
# Translation table: katakana code point -> hiragana code point.
# Covers ァ (U+30A1) through ヶ (U+30F6) plus the iteration marks ヽ (U+30FD)
# and ヾ (U+30FE); each hiragana twin sits exactly 96 code points lower.
_KATAKANA_TO_HIRAGANA = {
    code: code - 96 for code in (*range(0x30A1, 0x30F7), 0x30FD, 0x30FE)
}


# Function that converts katakana to hiragana
def kata2Hira(target):
    """Return *target* with katakana replaced by the matching hiragana.

    Characters outside the table (kanji, hiragana, punctuation, the long
    vowel mark ー, ...) are left unchanged.

    Args:
        target: The string to convert.

    Returns:
        A new string of the same length as *target*.
    """
    return target.translate(_KATAKANA_TO_HIRAGANA)
# Demo: convert the katakana readings inside the ruby markup to hiragana.
# Kanji and the markup characters are not katakana, so they pass through as-is.
print(kata2Hira("|封印《フウイン》|魔道《マドウ》|士《シ》は魔道士になる|夢《ユメ》を|見る《ミル》"))
def _is_kanji(ch):
    """Return True when *ch* counts as kanji for ruby purposes.

    The test is by code point: 13312 < ord(ch) < 64217, or the iteration
    mark 々 (12293).
    """
    code = ord(ch)
    return 13312 < code < 64217 or code == 12293


def ruby_conversion(kanjis, hiragana):
    """Distribute a reading over the kanji runs of a text as ruby markup.

    *kanjis* is cut into "idioms": each starts at the beginning of a kanji
    run and extends up to (not including) the next run, so it is a kanji
    stem followed by its trailing non-kanji characters. Working from the
    last idiom backwards, the reading is split at the rightmost character
    that equals the last character of the preceding idiom, and each idiom is
    emitted as ``|stem《reading》trailing``.

    Known limitation: the split point is found by a single-character match,
    so a kanji reading that itself contains the preceding idiom's last
    character can be split in the wrong place.

    Args:
        kanjis: Surface text (kanji mixed with other characters).
        hiragana: Reading of the whole of *kanjis*. Characters of *kanjis*
            before its first kanji are assumed to occupy the same number of
            characters at the start of the reading.

    Returns:
        The text with ruby markup. Text without any kanji is returned
        unchanged.

    Raises:
        AssertionError: If the reading cannot be aligned with the text (no
            split point is found for some idiom).
    """
    # Index of the first character of every kanji run.
    run_starts = [
        i
        for i, ch in enumerate(kanjis)
        if _is_kanji(ch) and (i == 0 or not _is_kanji(kanjis[i - 1]))
    ]
    if not run_starts:
        return kanjis

    # Leading non-kanji text is copied as-is and skipped in the reading too.
    first_start = run_starts[0]
    leading_text = kanjis[:first_start]
    hiragana = hiragana[first_start:]

    bounds = run_starts + [len(kanjis)]
    idioms = [kanjis[begin:end] for begin, end in zip(bounds, bounds[1:])]

    pieces = []
    for index in range(len(idioms) - 1, -1, -1):
        idiom = idioms[index]
        trailing_len = sum(1 for ch in idiom if not _is_kanji(ch))
        stem_len = len(idiom) - trailing_len
        stem, trailing = idiom[:stem_len], idiom[stem_len:]
        # The trailing characters of the idiom take up the same number of
        # characters at the end of the remaining reading.
        reading_end = max(len(hiragana) - trailing_len, 0)

        if index == 0:
            # First idiom of the text: everything that is left belongs to it.
            # It is always emitted, even if the reading has run out, so that
            # no text is lost.
            reading = hiragana[:reading_end]
        else:
            boundary_char = idioms[index - 1][-1]
            # Rightmost occurrence of the boundary that still leaves at least
            # one reading character for the kanji stem. Accepting a
            # zero-length reading produced empty ruby or, with no trailing
            # kana, made the stem swallow the whole reading.
            split = -1
            if reading_end >= 2:
                split = hiragana.rfind(boundary_char, 0, reading_end - 1)
            if split < 0:
                # Raised explicitly (not via ``assert``) so the check also
                # works under ``python -O``; the type is unchanged for callers.
                raise AssertionError(
                    f"cannot align reading {hiragana!r} with {idiom!r}"
                )
            reading = hiragana[split + 1:reading_end]
            hiragana = hiragana[:split + 1]

        pieces.append(f"|{stem}《{reading}》{trailing}")

    return leading_text + "".join(reversed(pieces))
# Demo: split one hiragana reading across the kanji runs of a sample sentence
# and print the ruby-annotated result.
print(ruby_conversion("封印魔道士は魔道士になる夢を見る", "ふういんまどうしはまどうしになるゆめをみる"))
from janome.tokenizer import Tokenizer
import os
def huriganas(text_load, easy_reading=set()):
    """Add hiragana ruby to kanji words the first time their kanji appear.

    The text is processed line by line with the janome ``Tokenizer``. Each
    token reading is passed through ``kata2Hira``. A token that contains a
    kanji not seen before is rewritten by ``ruby_conversion``, unless its
    reading itself still contains kanji, in which case the token is copied
    unchanged. All other tokens are copied unchanged as well.

    Args:
        text_load: The text to annotate; lines are separated by ``"\\n"``.
        easy_reading: Characters that never need ruby. If the elementary
            school kanji list file exists in the working directory, its
            characters are added to this set.

    Returns:
        The annotated text. Every input line is followed by ``"\\n"``.
    """

    def is_kanji(ch):
        # Same code-point test as the original: the kanji range plus 々 (12293).
        code = ord(ch)
        return code == 12293 or 13312 < code < 64217

    lines = text_load.split("\n")
    if os.path.exists(grade_school_list := r"小学生で習う漢字一覧のテキストファイル.txt"):
        easy_reading = easy_reading.union(set(read_file(grade_school_list)))
    seen = easy_reading
    tokenizer = Tokenizer()
    chunks = []
    for line in lines:
        # A line containing "[" is copied verbatim and resets the set of
        # already-seen characters back to the baseline.
        if "[" in line:
            chunks.append(line + "\n")
            seen = easy_reading
            continue
        pairs = [
            (token.surface, kata2Hira(token.reading))
            for token in tokenizer.tokenize(line)
            if len(token.surface) > 0
        ]
        inside_ruby = False
        for surface, reading in pairs:
            # Text that already carries ruby ("|" ... "》") is passed through
            # untouched; its characters still count as seen.
            if inside_ruby or "|" in surface:
                chunks.append(surface)
                seen = seen.union(set(surface))
                inside_ruby = "》" not in surface
                continue
            has_new_kanji = False
            for ch in surface:
                if is_kanji(ch) and ch not in seen:
                    has_new_kanji = True
            if not has_new_kanji:
                chunks.append(surface)
                continue
            reading_has_kanji = len([ch for ch in reading if is_kanji(ch)]) > 0
            if reading_has_kanji:
                # The reading is not pure kana, so no ruby is added and the
                # kanji are not marked as seen.
                chunks.append(surface)
            else:
                chunks.append(ruby_conversion(surface, reading))
                seen = seen.union(set(surface))
        chunks.append("\n")
    return "".join(chunks)
if __name__ == "__main__":
    # Demo: annotate a one-line sample with huriganas() and print the result.
    print(huriganas(text_load="封印魔道士は魔道士なる夢を見る"))
if __name__ == "__main__":
    # Path of the text file to annotate. The literal below is a placeholder
    # to be replaced by the user (it appears to be a hint about copying a
    # file path on Windows 10).
    text_file_path = r"Windows10の場合、Shift + 右クリックでパスのコピーがでます.txt"
    ruby_text = hurigana(read_file(text_file_path))
    # Write the result next to the input as "<name>_ruby.txt". splitext()
    # strips whatever extension the file has, instead of assuming that the
    # extension is exactly four characters long (".txt").
    ruby_path = os.path.splitext(os.path.abspath(text_file_path))[0] + "_ruby.txt"
    with open(ruby_path, "w", encoding="utf-8") as f:
        f.write(ruby_text)
# Placeholder assignment: the literal is an instruction to paste the copied
# file path here, not a real path.
# NOTE(review): presumably this line is meant to replace the text_file_path
# assignment in the __main__ block above -- confirm where it belongs.
text_file_path = r"コピーしたファイルパスをベースト"