-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Description
import os
import pandas as pd
from aozora_corpus_generator.aozora import (
read_aozora_bunko_list,
read_aozora_bunko_xml
)
# Local clone / path of the Aozora Bunko repository
AOZORA_REPO = 'path/to/aozorabunko' # replace with the actual local path
# Zipped work list located inside the repository clone; handed to
# read_aozora_bunko_list() by fetch_tanka_texts().
CSV_LIST = os.path.join(AOZORA_REPO, 'list_person_all_extended_utf8.zip')
def fetch_tanka_texts():
    """Gather candidate tanka/waka lines from Aozora Bunko into a DataFrame.

    Loads the work catalogue from ``CSV_LIST``, keeps every work whose title
    contains '短歌' or '和歌集', parses that work's file, and emits one row per
    stripped text line that is longer than 10 characters.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text``, ``kana``,
        ``romaji``, ``author``, ``era``, ``collection``, ``season``,
        ``theme`` and ``notes``. Only ``text``, ``author`` and ``collection``
        are filled in; the remaining columns are empty strings.
    """
    # Load the metadata for all works.
    # NOTE(review): assumes the catalogue maps author -> {title -> meta} and
    # that meta['file_path'] is relative to AOZORA_REPO -- confirm against
    # the aozora_corpus_generator API.
    catalogue = read_aozora_bunko_list(CSV_LIST, ndc_map=None)

    rows = []
    for author, works in catalogue.items():
        for title, meta in works.items():
            # Only titles that look like tanka / waka collections are kept.
            if not ('短歌' in title or '和歌集' in title):
                continue

            xml_path = os.path.join(AOZORA_REPO, meta['file_path'])
            parsed = read_aozora_bunko_xml(
                xml_path, features=['orth'], no_punc=True
            )

            # Split on newlines and treat each sufficiently long line as one
            # poem.
            stripped_lines = (raw.strip() for raw in parsed['text'].split('\n'))
            rows.extend(
                {
                    'text': verse,
                    'kana': '',
                    'romaji': '',
                    'author': author,
                    'era': '',
                    'collection': title,
                    'season': '',
                    'theme': '',
                    'notes': '',
                }
                for verse in stripped_lines
                if len(verse) > 10
            )

    return pd.DataFrame(rows)
def main():
    """Export the collected tanka rows to ``waka_aozora.csv`` and report the count."""
    tanka_frame = fetch_tanka_texts()
    row_count = len(tanka_frame)
    # index=False keeps the DataFrame's row index out of the CSV.
    tanka_frame.to_csv('waka_aozora.csv', index=False, encoding='utf-8')
    print(f"青空文庫から短歌取得:{row_count} 件 → waka_aozora.csv")
# Run the export only when this file is executed as a script, not on import.
# (Stray web-page text that had been fused onto the main() call was removed;
# it made the line a syntax error.)
if __name__ == '__main__':
    main()
Metadata
Metadata
Assignees
Labels
No labels