Skip to content

csv抽出 #1

@9R0M

Description

@9R0M
import requests
import zipfile
import io
import re
import pandas as pd

# Bibliographic ID of the Man'yoshu in the CODH collection; interpolated into
# the download URL built by fetch_manyoshu_zip().
BOOK_ID = '200015542'

def fetch_manyoshu_zip(timeout: float = 30.0) -> zipfile.ZipFile:
    """Download the ZIP archive for ``BOOK_ID`` from CODH and open it in memory.

    Args:
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without this a stalled
            connection would block the script indefinitely.

    Returns:
        A ``zipfile.ZipFile`` backed by an in-memory buffer that holds the
        downloaded bytes. The caller is responsible for closing it.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        zipfile.BadZipFile: If the response body is not a valid ZIP archive.
    """
    url = f'https://codh.rois.ac.jp/pmjt/book/{BOOK_ID}/download.zip'
    print(f"Downloading ZIP: {url}")
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return zipfile.ZipFile(io.BytesIO(res.content))

def extract_poems(z: zipfile.ZipFile) -> list:
    """Collect poem texts from every ``.txt`` member of a ZIP archive.

    Each text file is split into chunks at every newline that is immediately
    followed by a digit. A chunk counts as a poem when it has at least two
    lines and its first line starts with a digit; the remaining lines are
    concatenated (with no separator) to form the poem text.

    Args:
        z: An open ZIP archive. Members whose names do not end in ``.txt``
            are ignored.

    Returns:
        A list of poem strings in archive order.

    Raises:
        UnicodeDecodeError: If a ``.txt`` member is not valid UTF-8.
    """
    # Compile once instead of re-resolving the patterns for every file/chunk.
    # Inside a lookahead / prefix match, one digit is equivalent to ``\d+``.
    chunk_boundary = re.compile(r'\n(?=\d)')
    starts_with_digit = re.compile(r'\d')

    poems = []
    for name in z.namelist():
        if not name.endswith('.txt'):
            continue
        # 'utf-8-sig' drops a leading byte-order mark if present. With plain
        # 'utf-8' the BOM stays in front of the first heading, str.strip()
        # does not remove it, and the first poem of the file is silently lost.
        raw = z.read(name).decode('utf-8-sig')
        for part in chunk_boundary.split(raw):
            lines = part.strip().splitlines()
            if len(lines) >= 2 and starts_with_digit.match(lines[0]):
                poems.append(''.join(lines[1:]).strip())
    print(f"Extracted poems (raw): {len(poems)}")
    return poems

def to_dataframe(poems):
    """Wrap poem texts in a DataFrame alongside fixed metadata columns.

    The ``text`` column holds the given poems; every other column is filled
    with the same constant value on each row.
    """
    # Scalar values are broadcast by pandas to the length of ``text``.
    constant_columns = {
        'kana': '',
        'romaji': '',
        'author': '不明',
        'era': '奈良',
        'collection': '万葉集',
        'season': '',
        'theme': '',
        'notes': '',
    }
    table = {'text': poems, **constant_columns}
    return pd.DataFrame(table)

def main():
    """Download the archive, extract the poems, and write them to a CSV file.

    The output is written to ``waka_manyoshu.csv`` in the current working
    directory, UTF-8 encoded and without the DataFrame index.
    """
    print("Fetching Manyoshu ZIP...")
    # The context manager closes the archive once the poems have been read,
    # instead of leaving it open for the rest of the run.
    with fetch_manyoshu_zip() as z:
        print("Processing text files...")
        poems = extract_poems(z)
    df = to_dataframe(poems)
    output_path = 'waka_manyoshu.csv'
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"CSV output completed: {len(df)} rows → {output_path}")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Sub-issues

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions