Skip to content

青空文庫 #2

@9R0M

Description

@9R0M
import os
import pandas as pd
from aozora_corpus_generator.aozora import (
    read_aozora_bunko_list,
    read_aozora_bunko_xml
)

# Path to the local clone of the Aozora Bunko repository
AOZORA_REPO = 'path/to/aozorabunko'  # replace with the actual local path
# Work-list archive inside the repository; passed to read_aozora_bunko_list
CSV_LIST = os.path.join(AOZORA_REPO, 'list_person_all_extended_utf8.zip')

def fetch_tanka_texts():
    """Collect candidate tanka lines from Aozora Bunko works into a DataFrame.

    Loads the work list from ``CSV_LIST``, keeps the works whose title
    contains '短歌' or '和歌集', reads each of them with
    ``read_aozora_bunko_xml`` and turns every stripped line longer than
    10 characters into one record.

    Returns:
        A ``pandas.DataFrame`` with the columns text, kana, romaji, author,
        era, collection, season, theme and notes. Only text, author and
        collection are filled in; the other columns hold empty strings.
        The columns are present even when no line was collected, so callers
        can rely on the schema.
    """
    columns = ['text', 'kana', 'romaji', 'author', 'era',
               'collection', 'season', 'theme', 'notes']
    title_markers = ('短歌', '和歌集')

    # Load the metadata of all works
    aozora_db = read_aozora_bunko_list(CSV_LIST, ndc_map=None)
    data = []
    for author, works in aozora_db.items():
        for title, meta in works.items():
            if not any(marker in title for marker in title_markers):
                continue
            path = os.path.join(AOZORA_REPO, meta['file_path'])
            processed = read_aozora_bunko_xml(path, features=['orth'], no_punc=True)
            # Split on newlines and treat each remaining line as one poem
            for line in processed['text'].split('\n'):
                line = line.strip()
                if len(line) > 10:
                    # Start from an all-empty row; only three fields are known here
                    row = dict.fromkeys(columns, '')
                    row.update(text=line, author=author, collection=title)
                    data.append(row)
    # Explicit columns keep the schema (and the CSV header) when data is empty
    return pd.DataFrame(data, columns=columns)

def main():
    """Fetch the tanka records and save them as ``waka_aozora.csv``.

    Writes the DataFrame returned by ``fetch_tanka_texts`` to a UTF-8 CSV
    without the index column, then prints how many rows it contains.
    """
    output_name = 'waka_aozora.csv'
    tanka_frame = fetch_tanka_texts()
    tanka_frame.to_csv(output_name, encoding='utf-8', index=False)
    row_count = len(tanka_frame)
    print(f"青空文庫から短歌取得:{row_count} 件 → waka_aozora.csv")

# Run main() only when the file is executed as a script, not when imported
if __name__ == '__main__':
    main()

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions