SPECCONVERTOR/PDF Scraper 3.py at main · koolbear-oss/SPECCONVERTOR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pdfquery
import pandas as pd
import tempfile

def extract_text_from_pdf(pdf_path):
    """
    Extract the horizontal text lines of every page of a PDF using pdfquery.

    The combined text is also printed to stdout before it is returned.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Text of all pages. The lines of a page are joined with a single
            space, and the pages are joined with a single space as well.
    """
    pdf = pdfquery.PDFQuery(pdf_path)
    try:
        # Parse every page so the whole document can be queried.
        pdf.load()

        pages_text = []
        # NOTE(review): `_pages` is a private pdfquery attribute; it is read
        # after load() to obtain the page count. Confirm it stays available
        # when upgrading pdfquery.
        for page_index in range(len(pdf._pages)):
            # Horizontal text lines that belong to the current page only.
            lines = pdf.pq(
                f'LTPage[page_index="{page_index}"] LTTextLineHorizontal'
            )
            # An element without text content has `text` set to None, so
            # fall back to an empty string instead of failing on .strip().
            pages_text.append(
                " ".join((line.text or "").strip() for line in lines)
            )
    finally:
        # Presumably pdfquery keeps the handle it opened for a path in
        # `file` (TODO confirm); close it so the PDF is not left open.
        # A file object supplied by the caller is left untouched.
        handle = getattr(pdf, "file", None)
        if handle is not None and handle is not pdf_path:
            close = getattr(handle, "close", None)
            if callable(close):
                close()

    combined_text = " ".join(pages_text)
    print(combined_text)

    return combined_text

def save_to_excel(extracted_text, excel_path):
    """
    Write the extracted text to an Excel workbook via pandas.

    The workbook holds a single column, "Oorspronkelijke Tekst", with one row
    containing the text. Any error raised while writing is caught and
    reported on stdout instead of being propagated to the caller.

    Args:
        extracted_text (str): Extracted text from the PDF.
        excel_path (str): Destination path of the Excel file.
    """
    # One column, one row: the full extracted text.
    frame = pd.DataFrame({"Oorspronkelijke Tekst": [extracted_text]})

    # Best-effort write: the outcome is reported on stdout either way.
    try:
        frame.to_excel(excel_path, index=False)
        print(f"Bestanden succesvol opgeslagen in {excel_path}")
    except Exception as e:
        print(f"Fout bij opslaan in Excel: {e}")

if __name__ == "__main__":
    # Local imports: only the script entry point needs them.
    import os
    import sys

    # Usage: <script> [PDF_PATH [EXCEL_PATH]]
    # Without arguments the original default locations are used.
    default_pdf_path = "C:/Users/matcol/Documents/Python/SPECCONVERTOR/technische fiche.pdf"

    # Path to the PDF: first command-line argument, else the default.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path

    # Path of the Excel file: second command-line argument, else the PDF
    # path with its extension replaced by ".xlsx" (for the default PDF this
    # is the same location the script always wrote to).
    if len(sys.argv) > 2:
        excel_path = sys.argv[2]
    else:
        excel_path = os.path.splitext(pdf_path)[0] + ".xlsx"

    # Extract the text from the PDF.
    extracted_text = extract_text_from_pdf(pdf_path)

    # Save to Excel; save_to_excel prints its own success or failure line.
    save_to_excel(extracted_text, excel_path)
    print("Tekst succesvol geëxtraheerd, opgeschoond en opgeslagen in Excel.")