-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathtempCodeRunnerFile.python
More file actions
44 lines (36 loc) · 1.54 KB
/
tempCodeRunnerFile.python
File metadata and controls
44 lines (36 loc) · 1.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import fitz # PyMuPDF
import requests
import os
from bs4 import BeautifulSoup
# Function to download a PDF file
def download_pdf(url, folder_path):
    """Download the resource at ``url`` and save it inside ``folder_path``.

    The file is named after the last segment of the URL path. When the
    server answers with an error status nothing is written; the failure is
    reported on stdout instead.

    Args:
        url: Address of the file to download.
        folder_path: Existing directory that receives the file.

    Raises:
        requests.RequestException: If the request fails or times out.
        OSError: If the destination file cannot be written.
    """
    # Imported locally so this function does not rely on changes to the
    # file-level import section.
    from urllib.parse import urlsplit

    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # Saving an error page under a .pdf name would leave a corrupt "PDF".
        print(f"Skipped (HTTP {response.status_code}): {url}")
        return

    # Use only the path component so a "?query" or "#fragment" suffix does
    # not leak into the file name.
    filename = os.path.join(folder_path, urlsplit(url).path.split("/")[-1])
    with open(filename, 'wb') as pdf_file:
        pdf_file.write(response.content)
    print(f"Downloaded: {filename}")
# Function to extract hyperlinks from a PDF
def extract_links_from_pdf(pdf_path):
    """Collect the ``uri`` entry of every link on every page of a PDF.

    Args:
        pdf_path: Path of the PDF document to open with PyMuPDF.

    Returns:
        A list with one item per link, in page order. Each item is the
        result of ``.get('uri')`` on the link dictionary, so a link without
        a ``uri`` key contributes ``None``.
    """
    uris = []
    with fitz.open(pdf_path) as document:
        for page in document:
            uris.extend(entry.get('uri') for entry in page.get_links())
    return uris
# Function to download PDFs from extracted hyperlinks
def download_pdfs_from_links(links, folder_path):
    """Download every link that points at a PDF file into ``folder_path``.

    Args:
        links: Iterable of link targets; ``None`` and empty strings are
            skipped.
        folder_path: Directory handed to ``download_pdf`` for each download.
    """
    for link in links:
        # Compare case-insensitively so targets such as "FARES.PDF" are not
        # silently skipped.
        if link and link.lower().endswith('.pdf'):
            download_pdf(link, folder_path)
# Script entry: these statements run whenever the file is executed or imported.
# NOTE(review): both paths below are absolute, machine-specific Windows paths.
# Source PDF whose embedded hyperlinks are scanned for downloadable files
pdf_path = r"F:\UCSC\Second Year\First Sem\Group Project\Data Bases\Ticket Fare\main_line.pdf"
# Destination folder for the downloaded PDFs
download_folder = r"F:\UCSC\Second Year\First Sem\Group Project\Data Bases\Ticket Fare\Main Line\All PDFs"
# Create the destination folder and any missing parents; exist_ok=True makes
# this a no-op when the folder already exists
os.makedirs(download_folder, exist_ok=True)
# Collect the link targets found in the source PDF
pdf_links = extract_links_from_pdf(pdf_path)
# Download the extracted links that point to PDF files into the destination folder
download_pdfs_from_links(pdf_links, download_folder)