-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_corpus.py
More file actions
109 lines (86 loc) · 3.56 KB
/
build_corpus.py
File metadata and controls
109 lines (86 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import argparse
import logging
from pathlib import Path
from datasets import Dataset, Features, Image, Value
from tqdm import tqdm
from kovidore_data_generator.parser import UpstageDocumentParser
from kovidore_data_generator.config import ALL_SUBSETS
logger = logging.getLogger(__name__)
def _parse_page_number(image_path):
    """Return the page number encoded after the last ``_`` of an image name.

    Args:
        image_path: Path of a ``.png`` page image.

    Returns:
        The trailing integer, or ``None`` when the segment after the last
        underscore is not an integer (so callers can skip the file).
    """
    page_part = image_path.name.split("_")[-1].replace(".png", "")
    try:
        return int(page_part)
    except ValueError:
        return None


def _collect_page_images(images_dir, doc_id, all_doc_ids=()):
    """Return ``(page_number, image_path)`` pairs for one document, by page.

    A file belongs to ``doc_id`` when its name starts with ``"<doc_id>_"``,
    ends with ``.png`` and its last ``_``-separated segment is an integer.

    Args:
        images_dir: Directory holding the rendered page images.
        doc_id: Stem of the PDF whose pages are wanted.
        all_doc_ids: Stems of every PDF in the same subset. Files that match
            the longer prefix of another document (``report_2_1.png`` when
            both ``report`` and ``report_2`` exist) are left to that document.

    Returns:
        A list of ``(page_number, image_path)`` tuples sorted by page number.
    """
    prefix = f"{doc_id}_"
    # Other documents whose stem extends this one; their images share our prefix.
    longer_prefixes = tuple(
        f"{other}_" for other in all_doc_ids if other.startswith(prefix)
    )

    pages = []
    # Match the prefix with str.startswith rather than embedding doc_id in the
    # glob pattern, so stems containing "[", "*" or "?" are taken literally.
    for image_path in images_dir.glob("*.png"):
        name = image_path.name
        if not name.startswith(prefix) or name.startswith(longer_prefixes):
            continue
        page_number = _parse_page_number(image_path)
        if page_number is None:
            # A stray file must not abort the whole document.
            logger.warning(f"Skipping image without a page number: {image_path}")
            continue
        pages.append((page_number, image_path))

    pages.sort(key=lambda page: page[0])
    return pages


def main(args):
    """Build ``corpus/data.parquet`` for every requested subset.

    For each subset under ``<this file's directory>/data`` the PDFs in
    ``pdfs/`` are converted to page images in ``images/``, every page image
    is parsed, and one row per page is written to ``corpus/data.parquet``.
    Subsets without a ``pdfs`` directory or without PDF files are skipped
    with a warning. A PDF that fails to convert is skipped; a page that
    fails to parse is kept with empty ``markdown`` and ``elements``.

    Args:
        args: Parsed command-line arguments; only ``args.subsets`` (an
            iterable of subset directory names) is read.
    """
    # You can replace UpstageDocumentParser with any other parser you have implemented
    parser = UpstageDocumentParser()
    data_dir = Path(__file__).parent / "data"

    for subset_name in args.subsets:
        subset_dir = data_dir / subset_name
        pdfs_dir = subset_dir / "pdfs"
        images_dir = subset_dir / "images"

        if not pdfs_dir.exists():
            logger.warning(f"No pdfs directory found in {subset_name}")
            continue

        # Sorted so that corpus_id assignment does not depend on the
        # filesystem's directory listing order.
        pdf_files = sorted(pdfs_dir.glob("*.pdf"))
        if not pdf_files:
            logger.warning(f"No PDF files found in {pdfs_dir}")
            continue

        logger.info(f"Processing subset: {subset_name} ({len(pdf_files)} PDF files)")

        doc_ids = [pdf_file.stem for pdf_file in pdf_files]
        data = []

        for pdf_file in pdf_files:
            doc_id = pdf_file.stem

            # Best effort per document: a failure here skips only this PDF.
            try:
                # NOTE(review): presumably writes "<doc_id>_<page>.png" files
                # into output_folder - confirm against the parser.
                parser.convert_pdf_to_image(
                    pdf_path=str(pdf_file),
                    output_folder=str(images_dir),
                )
                pages = _collect_page_images(images_dir, doc_id, doc_ids)
            except Exception as e:
                logger.exception(f"Error converting {pdf_file}: {e}")
                continue

            for page_number, image_file in tqdm(pages, desc="Parsing images"):
                try:
                    result = parser.parse_document(str(image_file))
                    markdown = result.get("content", "")
                    elements = result.get("elements", "")
                except Exception as e:
                    # Keep the page in the corpus even when parsing fails.
                    logger.exception(f"Error parsing {image_file}: {e}")
                    markdown = ""
                    elements = ""

                data.append(
                    {
                        # Rows are numbered consecutively within the subset.
                        "corpus_id": len(data),
                        "image": str(image_file),
                        "doc_id": doc_id,
                        "markdown": markdown,
                        "elements": elements,
                        "page_number_in_doc": page_number,
                    }
                )

        if data:
            corpus_path = subset_dir / "corpus" / "data.parquet"
            # Do not rely on the writer to create the output directory.
            corpus_path.parent.mkdir(parents=True, exist_ok=True)

            features = Features(
                {
                    "corpus_id": Value("int64"),
                    "image": Image(),
                    "doc_id": Value("string"),
                    "markdown": Value("string"),
                    "elements": Value("string"),
                    "page_number_in_doc": Value("int64"),
                }
            )
            dataset = Dataset.from_list(data, features=features)
            dataset.to_parquet(corpus_path)
            logger.info(f"Saved parquet to {corpus_path} ({len(data)} rows)")

    logger.info("All subsets completed!")
if __name__ == "__main__":
    # Script entry point: configure logging, read the subset selection from
    # the command line and build the corpus for each selected subset.
    # Without a configured handler the INFO-level progress messages emitted
    # by main() would be dropped (only WARNING and above are shown).
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--subsets",
        nargs="*",
        default=ALL_SUBSETS,
        help=f"Subsets to run (default: all subsets). Available subsets: {', '.join(ALL_SUBSETS)}",
    )
    args = arg_parser.parse_args()
    main(args)