Skip to content

[ORG]: Enhance function parse_paper_pdf_to_json in file utils/input_formatter.py #274

@JingjunXu

Description

@JingjunXu

Description

I tried to format several PDFs using the parse_paper_pdf_to_json function in utils/input_formatter.py, but the function failed to process some of them, so I reimplemented it.

import re
import json
from typing import List, Optional
from pathlib import Path
from pdfminer.high_level import extract_text

def check_str_regex(s: str) -> bool:
    """Return True when *s* has at least 3 digits and fewer than 10 letters.

    The three digits must occur on the same line of *s*; only ASCII letters
    (A-Z, a-z) count towards the letter limit.
    """
    # Three digits separated only by non-digit, non-newline characters.  This
    # accepts exactly the same strings as `(?:.*\d){3,}` but cannot backtrack
    # super-linearly on long paragraphs that contain fewer than three digits.
    has_3_digits = re.search(r'\d[^\d\n]*\d[^\d\n]*\d', s) is not None
    letter_count = len(re.findall(r'[A-Za-z]', s))
    return has_3_digits and letter_count < 10

def preprocess_lines_in_paragraphs(lines: list) -> list:
    """Merge consecutive non-blank lines into paragraphs.

    Blank (empty or whitespace-only) lines separate paragraphs.  Within a
    paragraph:

    * a line ending in ``-`` is treated as a hyphenated line break: the hyphen
      is dropped and the next line is glued on directly;
    * any other line break becomes a single space, unless whitespace is
      already present on either side of the break, so that the last word of
      one line is not fused with the first word of the next.

    Paragraphs made of a single line are returned unchanged.

    Args:
        lines: Physical text lines without their line terminators.

    Returns:
        One string per paragraph, in document order.
    """
    paragraphs = []
    pieces = []
    # True when the previous line of the current paragraph ended on a word
    # boundary that is not yet followed by whitespace.
    needs_space = False

    for line in lines:
        if not line.strip():
            # Blank line -> close the paragraph collected so far (if any).
            if pieces:
                paragraphs.append("".join(pieces))
                pieces = []
            needs_space = False
            continue

        if needs_space and not line[0].isspace():
            pieces.append(" ")

        if line.endswith("-"):
            # Hyphenated break: the word continues on the next line.
            pieces.append(line[:-1])
            needs_space = False
        else:
            pieces.append(line)
            needs_space = not line[-1].isspace()

    if pieces:
        paragraphs.append("".join(pieces))

    return paragraphs

def _first_index_of(paragraphs, candidates):
    """Return the index of the first candidate present in *paragraphs*, else None."""
    for candidate in candidates:
        try:
            return paragraphs.index(candidate)
        except ValueError:
            continue
    return None


def extract_paragraphs_from_pdf(pdf_path: Path, output_json_file: Path, filter_list: Optional[List[str]] = None):
    """Extract the text of a PDF and write it as section-structured JSON.

    The text is split into paragraphs, restricted to the range between the
    "Abstract"/"ABSTRACT" paragraph and the "Appendix"/"APPENDIX" paragraph
    (falling back to the document start/end when one is missing), and grouped
    under the headings detected along the way.  The resulting JSON object maps
    ``"Title"`` to a string and every detected heading to a list of strings
    (paragraphs and figure/table captions).

    Args:
        pdf_path: PDF file to read.
        output_json_file: Destination of the JSON output.
        filter_list: Substrings marking paragraphs to drop (e.g. running
            headers).  ``None`` or an empty list disables filtering.
    """
    filters = tuple(filter_list) if filter_list else ()

    # extract all the text from pdf and rebuild paragraphs from its lines
    full_text = extract_text(pdf_path)
    lines = full_text.splitlines()
    formatted_lines = preprocess_lines_in_paragraphs(lines)

    # only extract paragraphs between abstract and appendix
    start = _first_index_of(formatted_lines, ("Abstract", "ABSTRACT"))
    if start is None:
        print("can not find abstract")
        start = 0
    end = _first_index_of(formatted_lines, ("Appendix", "APPENDIX"))
    if end is None:
        print("can not find appendix")
        end = len(formatted_lines)

    # The title is taken from the second paragraph (presumably the first one
    # is a conference banner -- TODO confirm); shorter documents fall back to
    # whatever is available instead of raising IndexError.
    if len(formatted_lines) > 1:
        title = formatted_lines[1]
    elif formatted_lines:
        title = formatted_lines[0]
    else:
        title = ""
    structured_content = {
        "Title": title,
    }

    # State of the single pass below.  Each iteration first files the
    # *previous* paragraph (before_context) and then inspects the current one
    # for headings, because some headings span two paragraphs ("3" followed by
    # "Method").
    before_context = ""
    current_section_idx = 0
    current_subsection_idx = 0
    current_section = ""
    current_image_table = []  # captions waiting to be written
    is_chapter = False        # before_context is a heading, not body text
    num_image_table = 0       # captions currently trailing the last paragraph

    # The trailing None makes the loop run once more so that the final
    # paragraph of the range is filed as well.
    for line in [*formatted_lines[start:end], None]:
        # skip paragraphs with at least 3 digits and fewer than 10 letters
        if line is not None and check_str_regex(line):
            continue

        # check if before_context contains invalid content
        is_filter = any(text in before_context for text in filters)

        # before_context add into formatted context
        if is_filter:
            pass
        elif is_chapter:
            num_image_table = 0
            is_chapter = False
        elif before_context.startswith(("Figure", "Table")):
            current_image_table.append(before_context)
        elif (
            before_context != ""
            and not before_context.isdigit()  # get rid of pure digit
            # Text seen before any recognised heading has no section to go to.
            and current_section in structured_content
        ):
            section = structured_content[current_section]
            if not section:
                num_image_table = 0
                section.append(before_context)
            else:
                # Captions are only ever written directly after the paragraph
                # appended with them, so the last real paragraph sits just
                # before the trailing captions.
                last_paragraph = len(section) - 1 - num_image_table
                char_end = section[last_paragraph][-1]

                # "1." / "1)" / "•" / "(" openings continue the running
                # paragraph rather than starting a new one.
                starts_list_item = (
                    (before_context[0].isdigit() and before_context[1:2] in (".", ")"))
                    or before_context[0] in ("•", "(")
                )
                is_append = char_end == "." and not starts_list_item

                if is_append:
                    num_image_table = 0
                    section.append(before_context)
                    if current_image_table:
                        num_image_table = len(current_image_table)
                        section.extend(current_image_table)
                        current_image_table = []
                else:
                    section[last_paragraph] = section[last_paragraph] + " " + before_context

        if line is None:
            break

        # abstract
        if line in ("Abstract", "ABSTRACT"):
            is_chapter = True
            current_section = "Abstract"
            structured_content[current_section] = []
        # reference
        if line in ("REFERENCES", "References"):
            is_chapter = True
            current_section = "References"
            structured_content[current_section] = []
        # chapter: number and short title in separate paragraphs
        if before_context.isdigit() and len(line) <= 20:
            is_chapter = True
            current_section = before_context + " " + line
            structured_content[current_section] = []
            current_section_idx = current_section_idx + 1
            current_subsection_idx = 0
        # chapter: "<digit> <title>" in one paragraph (slices avoid IndexError
        # on empty or one-character paragraphs)
        if not line.isdigit() and line[:1].isdigit() and line[1:2] == " ":
            is_chapter = True
            current_section = line
            structured_content[current_section] = []
            current_section_idx = current_section_idx + 1
            current_subsection_idx = 0
        # sub-chapter: "<section>.<next subsection>..."
        if (
            not line.isdigit()
            and line[:1] == str(current_section_idx)
            and line[1:2] == "."
            and line[2:3] == str(current_subsection_idx + 1)
        ):
            is_chapter = True
            current_section = line
            structured_content[current_section] = []
            current_subsection_idx = current_subsection_idx + 1

        before_context = line

    # output the structured content as json
    with open(output_json_file, 'w', encoding="utf-8") as json_file:
        json.dump(structured_content, json_file, indent=4)

    print(f"Successfully extract paragraphs from {pdf_path} , and format as json file {output_json_file}")
    
if __name__ == "__main__":
    # Example run: convert ./paper.pdf into ./data_paper.json, dropping the
    # ICLR 2025 banner paragraphs.
    banner_filters = [
        "Under review as a conference paper at ICLR 2025",
        "Published as a conference paper at ICLR 2025",
    ]

    extract_paragraphs_from_pdf("paper.pdf", "data_paper.json", banner_filters)

Additional Information

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions