csv抽出

```Python

import requests
import zipfile
import io
import re
import pandas as pd

# 📘 Bibliographic (book) ID of the Man'yoshu in the CODH dataset
BOOK_ID = '200015542'

def fetch_manyoshu_zip(timeout=60):
    """Download the ZIP archive for ``BOOK_ID`` and open it in memory.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it, requests waits indefinitely on a stalled server.

    Returns:
        A ``zipfile.ZipFile`` backed by the downloaded bytes (nothing is
        written to disk).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
        zipfile.BadZipFile: If the response body is not a valid ZIP archive.
    """
    url = f'https://codh.rois.ac.jp/pmjt/book/{BOOK_ID}/download.zip'
    print(f"Downloading ZIP: {url}")
    # An explicit timeout keeps the script from hanging forever when the
    # remote host accepts the connection but never sends data.
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return zipfile.ZipFile(io.BytesIO(res.content))

def extract_poems(z: zipfile.ZipFile):
    """Collect poem texts from every ``.txt`` member of the archive.

    Each text file is split into entries at newlines that are immediately
    followed by a digit. An entry is kept when its first line starts with
    a number and it has at least one further line; those further lines are
    concatenated (without separators) to form the poem text.

    Args:
        z: An open ZIP archive whose ``.txt`` members are UTF-8 encoded.

    Returns:
        A list of poem strings in archive/member order.

    Raises:
        UnicodeDecodeError: If a ``.txt`` member is not valid UTF-8.
    """
    entry_boundary = re.compile(r'\n(?=\d+)')
    leading_number = re.compile(r'^\d+')

    poems = []
    for name in z.namelist():
        if not name.endswith('.txt'):
            continue
        # 'utf-8-sig' drops a leading byte-order mark if present; with plain
        # 'utf-8' the BOM stays glued to the first line, the number check
        # below fails, and the first poem of the file is silently lost.
        raw = z.read(name).decode('utf-8-sig')
        for part in entry_boundary.split(raw):
            lines = part.strip().splitlines()
            if len(lines) >= 2 and leading_number.match(lines[0]):
                poems.append(''.join(lines[1:]).strip())
    print(f"Extracted poems (raw): {len(poems)}")
    return poems

def to_dataframe(poems):
    """Build the output table: one row per poem plus fixed metadata columns.

    Args:
        poems: Sequence of poem strings; becomes the ``text`` column.

    Returns:
        A ``pandas.DataFrame`` with columns ``text``, ``kana``, ``romaji``,
        ``author``, ``era``, ``collection``, ``season``, ``theme``, ``notes``.
        Every column except ``text`` holds the same constant in every row.
    """
    # Scalar values are broadcast by pandas to the length of ``poems``.
    constant_columns = {
        'kana': '',
        'romaji': '',
        'author': '不明',
        'era': '奈良',
        'collection': '万葉集',
        'season': '',
        'theme': '',
        'notes': '',
    }
    table = {'text': poems}
    table.update(constant_columns)
    return pd.DataFrame(table)

def main():
    """Download the archive, extract the poems, and write them to a CSV file.

    Writes ``waka_manyoshu.csv`` (UTF-8, no index column) to the current
    working directory and prints progress messages along the way.
    """
    print("Fetching Manyoshu ZIP...")
    # The context manager closes the archive as soon as extraction is done
    # instead of leaving it open until garbage collection.
    with fetch_manyoshu_zip() as z:
        print("Processing text files...")
        poems = extract_poems(z)
    df = to_dataframe(poems)
    output_path = 'waka_manyoshu.csv'
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"CSV output completed: {len(df)} rows → {output_path}")

# Run the pipeline only when this file is executed as a script, so importing
# it does not trigger a download.
if __name__ == '__main__':
    main()
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

csv抽出 #1

Sub-issues

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

csv抽出 #1

Description

Sub-issues

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions