[ORG]: Enhance function parse_paper_pdf_to_json in file utils/input_formatter.py

### Description


I tried to format several PDFs using the **parse_paper_pdf_to_json** function in **utils/input_formatter.py**, but the function failed to process some of them, so I reimplemented it.

```python
import re
import json
from typing import List, Optional
from pathlib import Path
from pdfminer.high_level import extract_text

def check_str_regex(s: str) -> bool:
    """Return True when *s* looks like numeric residue rather than prose.

    A string qualifies when it contains at least three digit characters
    and fewer than ten ASCII letters.

    Args:
        s: The text to classify.

    Returns:
        True if ``s`` has three or more digits and fewer than ten ASCII
        letters, False otherwise.
    """
    # Count matches directly instead of using a nested-quantifier pattern
    # such as (?:.*\d){3,}: counting is linear in len(s), does not backtrack,
    # and is not confused by digits that sit on different lines of s.
    digit_count = len(re.findall(r'\d', s))
    letter_count = len(re.findall(r'[A-Za-z]', s))
    return digit_count >= 3 and letter_count < 10

def preprocess_lines_in_paragraphs(lines: list) -> list:
    """Merge consecutive non-blank lines into paragraphs.

    Blank (empty or whitespace-only) lines separate paragraphs. Within a
    paragraph, a line ending in ``-`` is treated as a hyphenated line break:
    the hyphen is dropped and the next line is attached directly. Every
    other line break becomes a single space unless whitespace is already
    present at the seam, so the last word of one line is not glued to the
    first word of the next.

    Args:
        lines: Text lines without trailing newline characters.

    Returns:
        One string per paragraph, in document order.
    """
    formatted_lines = []
    buffer = []     # pieces of the paragraph currently being assembled
    separator = ""  # text to insert before the next line of this paragraph

    for line in lines:
        if not line.strip():
            # Blank line: close the paragraph in progress, if any.
            if buffer:
                formatted_lines.append("".join(buffer))
                buffer = []
            separator = ""
            continue

        # Only add a space when neither side of the seam already has one.
        if separator and not line[0].isspace():
            buffer.append(separator)

        if line[-1] == '-':
            # Hyphenated break: the word continues on the next line.
            buffer.append(line[:-1])
            separator = ""
        else:
            buffer.append(line)
            separator = "" if line[-1].isspace() else " "

    if buffer:
        formatted_lines.append("".join(buffer))

    return formatted_lines

def _find_heading(paragraphs: List[str], variants: List[str]) -> Optional[int]:
    """Return the index of the first paragraph equal to one of *variants*.

    Variants are tried in the order given, so an earlier variant wins even
    when a later one occurs sooner in the document. Returns None when no
    variant is present.
    """
    for heading in variants:
        try:
            return paragraphs.index(heading)
        except ValueError:
            continue
    return None


def _starts_new_paragraph(previous_text: str, candidate: str) -> bool:
    """Return True when *candidate* should be stored as its own paragraph.

    It is treated as a continuation of *previous_text* (and False is
    returned) when the previous text does not end with a full stop, or when
    the candidate opens like a list item: ``1.``, ``1)``, a bullet, or ``(``.
    """
    if not previous_text.endswith("."):
        return False
    if candidate[:1].isdigit() and candidate[1:2] in (".", ")"):
        return False
    return candidate[:1] not in ("•", "(")


def extract_paragraphs_from_pdf(pdf_path: Path, output_json_file: Path, filter_list: Optional[List[str]] = None):
    """Extract the text of a paper PDF and write it as section-structured JSON.

    The text is split into paragraphs on blank lines, restricted to the span
    from the "Abstract" heading up to (not including) the "Appendix" heading
    when those are found, and grouped under the headings recognised along
    the way. The JSON object maps ``"Title"`` to a string and every section
    heading to a list of paragraph and figure/table caption strings.

    Args:
        pdf_path: PDF file to read.
        output_json_file: Destination of the JSON output.
        filter_list: Substrings marking paragraphs to discard. ``None`` is
            treated as an empty list.

    Returns:
        None. The result is written to ``output_json_file``.
    """
    filters = list(filter_list) if filter_list is not None else []

    # extract all the text from pdf and rebuild paragraphs from its lines
    full_text = extract_text(pdf_path)
    formatted_lines = preprocess_lines_in_paragraphs(full_text.splitlines())

    # only extract paragraphs between abstract and appendix
    start = _find_heading(formatted_lines, ["Abstract", "ABSTRACT"])
    if start is None:
        print("can not find abstract")
        start = 0
    end = _find_heading(formatted_lines, ["Appendix", "APPENDIX"])
    if end is None:
        print("can not find appendix")
        end = len(formatted_lines)

    # The second paragraph is taken as the title; fall back gracefully when
    # the document has fewer than two paragraphs instead of raising.
    if len(formatted_lines) > 1:
        title = formatted_lines[1]
    elif formatted_lines:
        title = formatted_lines[0]
    else:
        title = ""
    structured_content = {
        "Title": title,
    }

    # Each pass commits the *previous* paragraph (before_context) and then
    # inspects the current one for headings, so a heading is known before
    # its own text would be stored as content.
    before_context = ""
    current_section_idx = 0
    current_subsection_idx = 0
    current_section = ""
    current_image_table = []  # captions waiting to be attached to a section
    is_chapter = False        # True when before_context is a heading
    last_text_idx = -1        # index of the last text paragraph in the section

    # The trailing None is a sentinel pass so the final paragraph is
    # committed too instead of being left behind in before_context.
    for line in formatted_lines[start:end] + [None]:
        # skip residue: at least 3 digits and fewer than 10 letters
        if line is not None and check_str_regex(line):
            continue

        # check if before_context contains invalid content
        is_filter = any(text in before_context for text in filters)

        # commit before_context into the structured content
        if is_filter:
            pass
        elif is_chapter:
            # before_context is the heading itself, not content
            is_chapter = False
        elif before_context.startswith(("Figure", "Table")):
            current_image_table.append(before_context)
        elif before_context and not before_context.isdigit():  # get rid of pure digit
            section_paragraphs = structured_content.get(current_section)
            if section_paragraphs is None:
                # Text seen before any recognised heading has no section
                # to live in; drop it rather than raising KeyError.
                pass
            elif not section_paragraphs:
                section_paragraphs.append(before_context)
                last_text_idx = 0
            elif _starts_new_paragraph(section_paragraphs[last_text_idx], before_context):
                section_paragraphs.append(before_context)
                # Remember where the paragraph sits *before* captions are
                # appended, so later continuations merge into the paragraph
                # and not into a caption.
                last_text_idx = len(section_paragraphs) - 1
                if current_image_table:
                    section_paragraphs.extend(current_image_table)
                    current_image_table = []
            else:
                section_paragraphs[last_text_idx] = section_paragraphs[last_text_idx] + " " + before_context

        if line is None:
            break

        # abstract
        if line == "Abstract" or line == "ABSTRACT":
            is_chapter = True
            current_section = "Abstract"
            structured_content[current_section] = []
        # reference
        if line == "REFERENCES" or line == "References":
            is_chapter = True
            current_section = "References"
            structured_content[current_section] = []
        # chapter whose number and title arrived as separate paragraphs
        if before_context.isdigit() and len(line) <= 20:
            is_chapter = True
            current_section = before_context + " " + line
            structured_content[current_section] = []
            current_section_idx = current_section_idx + 1
            current_subsection_idx = 0
        # chapter written as "<digit> <title>"; slices avoid IndexError on
        # very short paragraphs
        if not line.isdigit() and line[:1].isdigit() and line[1:2] == " ":
            is_chapter = True
            current_section = line
            structured_content[current_section] = []
            current_section_idx = current_section_idx + 1
            current_subsection_idx = 0
        # sub-chapter: must be the next expected "<section>.<subsection>"
        if line.startswith(f"{current_section_idx}.{current_subsection_idx + 1}"):
            is_chapter = True
            current_section = line
            structured_content[current_section] = []
            current_subsection_idx = current_subsection_idx + 1

        before_context = line

    # Captions still pending at the end would otherwise be lost.
    final_section = structured_content.get(current_section)
    if current_image_table and isinstance(final_section, list):
        final_section.extend(current_image_table)

    # output the structured content as json
    with open(output_json_file, 'w', encoding="utf-8") as json_file:
        json.dump(structured_content, json_file, indent=4)

    print("Successfully extract paragraphs from "+str(pdf_path)+" , and format as json file "+str(output_json_file))
    
if __name__ == "__main__":
    # Example run: convert "paper.pdf" in the working directory into
    # "data_paper.json". Paragraphs containing any of the strings below are
    # discarded by extract_paragraphs_from_pdf (presumably the venue's
    # running page headers).
    page_header_filters = [
        "Under review as a conference paper at ICLR 2025",
        "Published as a conference paper at ICLR 2025",
    ]

    extract_paragraphs_from_pdf(
        pdf_path="paper.pdf",
        output_json_file="data_paper.json",
        filter_list=page_header_filters,
    )
```

### Additional Information

_No response_

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[ORG]: Enhance function parse_paper_pdf_to_json in file utils/input_formatter.py #274

Description

Additional Information

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[ORG]: Enhance function parse_paper_pdf_to_json in file utils/input_formatter.py #274

Description

Description

Additional Information

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions