-
Notifications
You must be signed in to change notification settings - Fork 0
Open
0 / 10 of 1 issue completed
Description
import requests
import zipfile
import io
import re
import pandas as pd
# 📘 Bibliographic ID of the Man'yoshu (CODH); interpolated into the download URL.
BOOK_ID = '200015542'
def fetch_manyoshu_zip(timeout=60):
    """Download the ZIP archive for ``BOOK_ID`` from CODH and open it in memory.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Defaults to 60.

    Returns:
        A ``zipfile.ZipFile`` backed by the downloaded bytes (nothing is
        written to disk).

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f'https://codh.rois.ac.jp/pmjt/book/{BOOK_ID}/download.zip'
    print(f"Downloading ZIP: {url}")
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return zipfile.ZipFile(io.BytesIO(res.content))
def extract_poems(z: zipfile.ZipFile):
    """Collect poem texts from every ``.txt`` member of the archive.

    Each text file is split into records at every newline that is followed
    by a digit. A record is kept when its first line starts with a digit and
    it has at least one more line; the remaining lines are concatenated
    (without separators) to form the poem text.

    Args:
        z: An open ZIP archive whose ``.txt`` members are UTF-8 text.

    Returns:
        A list of poem strings, in archive order.

    Raises:
        UnicodeDecodeError: If a ``.txt`` member is not valid UTF-8.
    """
    poems = []
    for name in z.namelist():
        if not name.endswith('.txt'):
            continue
        # 'utf-8-sig' reads plain UTF-8 as well, but drops a leading BOM.
        # With plain 'utf-8' the BOM stays glued to the first header line,
        # the digit check below fails, and the file's first poem is lost.
        raw = z.read(name).decode('utf-8-sig')
        for part in re.split(r'\n(?=\d+)', raw):
            lines = part.strip().splitlines()
            # Need a numeric header line plus at least one line of text.
            if len(lines) >= 2 and re.match(r'^\d+', lines[0]):
                poems.append(''.join(lines[1:]).strip())
    print(f"Extracted poems (raw): {len(poems)}")
    return poems
def to_dataframe(poems):
    """Build the output table: one row per poem plus fixed metadata columns.

    Args:
        poems: Iterable of poem strings. It is materialized into a list, so
            generators and other one-shot iterables are accepted.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text``, ``kana``,
        ``romaji``, ``author``, ``era``, ``collection``, ``season``,
        ``theme`` and ``notes``. Only ``text`` varies per row; ``author``,
        ``era`` and ``collection`` are filled with the constants '不明',
        '奈良' and '万葉集', and the remaining columns are empty strings.
    """
    # Materialize first so a generator yields one row per poem.
    texts = list(poems)
    df = pd.DataFrame({
        'text': texts,
        'kana': '',
        'romaji': '',
        'author': '不明',
        'era': '奈良',
        'collection': '万葉集',
        'season': '',
        'theme': '',
        'notes': '',
    })
    return df
def main():
    """Download the archive, extract the poems, and write them to a CSV file.

    The result is written to ``waka_manyoshu.csv`` in the current working
    directory, UTF-8 encoded and without the DataFrame index.
    """
    print("Fetching Manyoshu ZIP...")
    # Use the archive as a context manager so it is closed once the poems
    # have been read, even if extraction raises.
    with fetch_manyoshu_zip() as z:
        print("Processing text files...")
        poems = extract_poems(z)
    df = to_dataframe(poems)
    output_path = 'waka_manyoshu.csv'
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"CSV output completed: {len(df)} rows → {output_path}")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sub-issues
Metadata
Metadata
Assignees
Labels
No labels