-
Notifications
You must be signed in to change notification settings - Fork 16
Open
Description
Description
I tried to format several PDFs with the parse_paper_pdf_to_json function in utils/input_formatter.py, but it failed on some of them, so I reimplemented the extraction as follows.
import re
import json
from typing import List, Optional
from pathlib import Path
from pdfminer.high_level import extract_text
def check_str_regex(s: str) -> bool:
    """Return True when *s* has at least 3 digits and fewer than 10 ASCII letters.

    Digits are matched with ``\\d`` and letters with ``[A-Za-z]``; every other
    character is ignored by both counts.

    Args:
        s: The text fragment to classify.

    Returns:
        True if both conditions hold, otherwise False.
    """
    # Count matches directly instead of searching with a nested-quantifier
    # pattern: findall is a single linear pass, whereas `(?:.*\d){3,}`
    # backtracks heavily on long strings that contain few digits.
    digit_count = len(re.findall(r'\d', s))
    letter_count = len(re.findall(r'[A-Za-z]', s))
    return digit_count >= 3 and letter_count < 10
def preprocess_lines_in_paragraphs(lines: list) -> list:
    """Merge consecutive non-blank lines into paragraph strings.

    A line that is empty or whitespace-only ends the current paragraph.
    Within a paragraph the lines are concatenated with no separator, and a
    line ending in '-' loses that final hyphen before being joined.

    Args:
        lines: Text lines in reading order.

    Returns:
        One string per paragraph, in the order the paragraphs appeared.
    """
    paragraphs = []
    pieces = []
    for raw in lines:
        if not raw.strip():
            # Blank line: close the paragraph collected so far, if any.
            if pieces:
                paragraphs.append("".join(pieces))
                pieces = []
            continue
        # Drop a trailing hyphen so the word continues on the next line.
        pieces.append(raw[:-1] if raw.endswith('-') else raw)
    # Flush the paragraph still open when the input ends.
    if pieces:
        paragraphs.append("".join(pieces))
    return paragraphs
def _find_heading(paragraphs: List[str], candidates: List[str], default: int, missing_message: str) -> int:
    """Return the index of the first candidate heading found in *paragraphs*.

    Candidates are tried in order and must match a paragraph exactly. When
    none is present, *missing_message* is printed and *default* is returned.
    """
    for heading in candidates:
        try:
            return paragraphs.index(heading)
        except ValueError:  # list.index raises ValueError when the item is absent
            continue
    print(missing_message)
    return default


def extract_paragraphs_from_pdf(pdf_path: Path, output_json_file: Path, filter_list: Optional[List[str]] = None) -> None:
    """Extract the text of a PDF and write it to a JSON file grouped by section.

    The text returned by ``extract_text`` is split into paragraphs, the range
    between the "Abstract" and "Appendix" headings is walked paragraph by
    paragraph, and every paragraph is stored under the most recent heading.
    The resulting JSON object maps "Title" to a string and every section
    heading to a list of strings (paragraphs and figure/table captions).

    Args:
        pdf_path: PDF to read; passed unchanged to ``extract_text``.
        output_json_file: Destination of the JSON output; passed to ``open``.
        filter_list: Substrings marking paragraphs to discard (for example
            running page headers). ``None`` means nothing is filtered.
    """
    # Treat the default None as "no filters" instead of iterating over None.
    filters = filter_list if filter_list is not None else []

    # Extract all text, split it into lines and rebuild paragraphs from the
    # blank lines between them.
    full_text = extract_text(pdf_path)
    formatted_lines = preprocess_lines_in_paragraphs(full_text.splitlines())

    # Only the paragraphs between the abstract and the appendix are processed;
    # without those headings the whole document is used.
    start = _find_heading(formatted_lines, ["Abstract", "ABSTRACT"], 0, "can not find abstract")
    end = _find_heading(formatted_lines, ["Appendix", "APPENDIX"], len(formatted_lines), "can not find appendix")

    # NOTE(review): assumes the second paragraph of the PDF is the title -
    # confirm against the target paper layout. Falls back to "" when the PDF
    # yielded fewer than two paragraphs.
    structured_content = {
        "Title": formatted_lines[1] if len(formatted_lines) > 1 else "",
    }

    before_context = ""        # previous paragraph, committed one iteration late
    current_section_idx = 0    # number of the current top-level section
    current_subsection_idx = 0 # number of the current sub-section
    current_section = ""       # key in structured_content receiving paragraphs
    current_image_table = []   # captions waiting for the next appended paragraph
    is_chapter = False         # True when before_context is a heading
    num_paragraph = 0
    num_image_table = 0

    # The trailing None is a sentinel: it triggers one extra pass that commits
    # the final paragraph, which would otherwise never leave before_context.
    for line in [*formatted_lines[start:end], None]:
        # Skip fragments with at least 3 digits and fewer than 10 letters.
        if line is not None and check_str_regex(line):
            continue

        # ---- Phase 1: commit the previous paragraph (before_context) ----
        is_filter = any(text in before_context for text in filters)
        if is_filter:
            pass
        elif is_chapter:
            # before_context is a heading: restart the counters, store nothing.
            num_image_table = 0
            num_paragraph = -1
            is_chapter = False
        elif before_context.startswith(("Figure", "Table")):
            current_image_table.append(before_context)
        elif before_context != "" and not before_context.isdigit() and current_section:
            # Pure-digit paragraphs are dropped. Text seen before any section
            # has been opened has no list to go into and is skipped too.
            paragraphs = structured_content[current_section]
            if len(paragraphs) == 0:
                num_image_table = 0
                num_paragraph += 1
                paragraphs.append(before_context)
            else:
                prev_idx = num_paragraph - num_image_table
                char_end = paragraphs[prev_idx][-1]
                # Leading "1.", "1)", "•" or "(" marks a list item.
                starts_list_item = (
                    (before_context[0].isdigit() and before_context[1:2] in (".", ")"))
                    or before_context[0] in ("•", "(")
                )
                # Start a new entry only after a finished sentence that is not
                # followed by a list item; anything else continues the
                # previous entry.
                is_append = char_end == "." and not starts_list_item
                if is_append:
                    num_image_table = 0
                    num_paragraph += 1
                    paragraphs.append(before_context)
                    # NOTE(review): nesting reconstructed from an unindented
                    # paste - the caption flush is assumed to belong to the
                    # append branch; confirm against the original script.
                    if current_image_table:
                        num_image_table = len(current_image_table)
                        paragraphs.extend(current_image_table)
                        current_image_table = []
                else:
                    paragraphs[prev_idx] = paragraphs[prev_idx] + " " + before_context

        if line is None:
            break  # sentinel reached: everything has been committed

        # ---- Phase 2: detect headings in the current paragraph ----
        # abstract
        if line in ("Abstract", "ABSTRACT"):
            is_chapter = True
            current_section = "Abstract"
            structured_content[current_section] = []
        # reference
        if line in ("REFERENCES", "References"):
            is_chapter = True
            current_section = "References"
            structured_content[current_section] = []
        # chapter whose number and title were extracted as separate paragraphs
        if before_context.isdigit() and len(line) <= 20:
            is_chapter = True
            current_section = before_context + " " + line
            structured_content[current_section] = []
            current_section_idx = current_section_idx + 1
            current_subsection_idx = 0
        # chapter written as "<digit> <title>"; slices keep short lines safe
        if not line.isdigit() and line[:1].isdigit() and line[1:2] == " ":
            is_chapter = True
            current_section = line
            structured_content[current_section] = []
            current_section_idx = current_section_idx + 1
            current_subsection_idx = 0
        # sub-chapter written as "<section>.<next subsection>..."
        if (
            not line.isdigit()
            and line[:1] == str(current_section_idx)
            and line[1:2] == "."
            and line[2:3] == str(current_subsection_idx + 1)
        ):
            is_chapter = True
            current_section = line
            structured_content[current_section] = []
            current_subsection_idx = current_subsection_idx + 1

        before_context = line

    # output the structured content as json
    with open(output_json_file, 'w') as json_file:
        json.dump(structured_content, json_file, indent=4)
    print("Successfully extract paragraphs from "+str(pdf_path)+" , and format as json file "+str(output_json_file))
if __name__ == "__main__":
    # Example run: convert paper.pdf into data_paper.json, discarding
    # paragraphs that contain the ICLR 2025 running page headers.
    filter_list = [
        "Under review as a conference paper at ICLR 2025",
        "Published as a conference paper at ICLR 2025",
    ]
    pdf_path = "paper.pdf"
    output_json_file = "data_paper.json"
    extract_paragraphs_from_pdf(pdf_path, output_json_file, filter_list=filter_list)
Additional Information
No response
Metadata
Metadata
Assignees
Labels
No labels