-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPDF Scraper 3.py
More file actions
83 lines (63 loc) · 2.42 KB
/
PDF Scraper 3.py
File metadata and controls
83 lines (63 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pdfquery
import pandas as pd
import tempfile
def extract_text_from_pdf(pdf_path):
    """
    Extract the horizontal text lines of every page of a PDF using pdfquery.

    The text of each ``LTTextLineHorizontal`` element is stripped, the lines
    of a page are joined with single spaces, and the pages are joined with
    single spaces as well. The combined text is also printed to stdout.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Combined text extracted from all pages.

    Raises:
        OSError: If the PDF file cannot be opened.
    """
    # Open the file ourselves (PDFQuery accepts a file object) so the handle
    # is closed deterministically instead of being left to the garbage
    # collector.
    with open(pdf_path, "rb") as pdf_file:
        pdf = pdfquery.PDFQuery(pdf_file)
        pdf.load()

        # Count pages through the parsed tree rather than the private
        # ``_pages`` cache of the PDFQuery object.
        page_count = len(pdf.pq("LTPage"))

        extracted_text = []
        for page_num in range(page_count):
            # Horizontal text lines that belong to the current page only.
            text_query = pdf.pq(
                'LTPage[page_index="{}"] LTTextLineHorizontal'.format(page_num)
            )
            # An element without text has ``text is None``; treat it as an
            # empty string instead of crashing on ``None.strip()``.
            page_text = [(element.text or "").strip() for element in text_query]
            extracted_text.append(" ".join(page_text))

    # Combine all pages into one string.
    combined_text = " ".join(extracted_text)
    print(combined_text)
    return combined_text
def save_to_excel(extracted_text, excel_path):
    """
    Save the extracted text to an Excel file using pandas.

    The workbook contains a single column, "Oorspronkelijke Tekst", with one
    row holding the full extracted text. Errors raised while writing are
    reported on stdout instead of being propagated (best-effort save).

    NOTE(review): Excel limits a cell to 32,767 characters; very long text
    may be truncated by the writer engine - confirm for large PDFs.

    Args:
        extracted_text (str): Extracted text from the PDF.
        excel_path (str): Path to the final Excel file.

    Returns:
        bool: True when the file was written, False when writing failed.
    """
    # One column, one row: the whole text ends up in a single cell.
    df = pd.DataFrame({"Oorspronkelijke Tekst": [extracted_text]})

    try:
        df.to_excel(excel_path, index=False)
    except Exception as e:
        # Deliberately broad: a missing writer engine, a bad path or a locked
        # file must not crash the script, but the caller is told via False.
        print(f"Fout bij opslaan in Excel: {e}")
        return False

    print(f"Bestanden succesvol opgeslagen in {excel_path}")
    return True
if __name__ == "__main__":
    # Usage: python "PDF Scraper 3.py" [pdf_path] [excel_path]
    # Without arguments the original hard-coded locations are used.
    import sys
    from pathlib import Path

    default_pdf_path = "C:/Users/matcol/Documents/Python/SPECCONVERTOR/technische fiche.pdf"
    default_excel_path = "C:/Users/matcol/Documents/Python/SPECCONVERTOR/technische fiche.xlsx"

    cli_args = sys.argv[1:]

    # Path to the PDF.
    pdf_path = cli_args[0] if cli_args else default_pdf_path

    # Path where the Excel file will be saved.
    if len(cli_args) >= 2:
        excel_path = cli_args[1]
    elif cli_args:
        # Only a PDF was given: write the workbook next to it.
        excel_path = str(Path(pdf_path).with_suffix(".xlsx"))
    else:
        excel_path = default_excel_path

    # Extract the text from the PDF.
    extracted_text = extract_text_from_pdf(pdf_path)

    # Save to Excel; only report overall success when the save step did not
    # explicitly return False.
    if save_to_excel(extracted_text, excel_path) is not False:
        print("Tekst succesvol geëxtraheerd, opgeschoond en opgeslagen in Excel.")