-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfotexts.py
More file actions
49 lines (38 loc) · 1.65 KB
/
fotexts.py
File metadata and controls
49 lines (38 loc) · 1.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import scrapy
class FOSpiderTexts(scrapy.Spider):
    """Scrape the study-text table and download every linked PDF.

    For each table row that carries a PDF link the spider yields an item
    ``{'title': ..., 'pdf_url': ...}`` and schedules a request whose
    response is written to the ``Texty/`` directory by :meth:`save_pdf`.
    """

    name = "fotexts"
    start_urls = [
        "http://fyzikalniolympiada.cz/studijni-texty",
    ]

    # Path separators and characters that are reserved in file names on
    # common filesystems; they are replaced before a title is used as a
    # file name.
    _UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'

    def parse(self, response):
        """Yield one item and one PDF download request per qualifying row.

        Args:
            response: The response for a page containing a
                ``table.texty-arch`` table.

        Yields:
            A dict with keys ``'title'`` and ``'pdf_url'`` (absolute URL),
            followed by a ``scrapy.Request`` for that URL whose callback is
            :meth:`save_pdf` and whose ``meta['title']`` is the row title
            (``'No title'`` when the title is empty).
        """
        for row in response.css('table.texty-arch tr'):
            # The title is the first text node of a cell followed by the
            # first text found inside <strong>; either part may be missing.
            before_strong = row.css('td::text').get()
            inside_strong = row.css('td strong::text').get()
            full_title = f"{before_strong.strip() if before_strong else ''} {inside_strong.strip() if inside_strong else ''}".strip()

            # Skip rows whose link has no non-empty text inside <i class="pdf">.
            if not row.css('a i.pdf::text').get():
                continue

            # Only links whose href ends in ".pdf" are downloaded.
            pdf_url = row.css('a[href$=".pdf"]::attr(href)').get()
            if not pdf_url:
                continue

            absolute_url = response.urljoin(pdf_url)

            # Emit the scraped data ...
            yield {
                'title': full_title,
                'pdf_url': absolute_url
            }
            # ... and schedule the download of the PDF itself.
            yield scrapy.Request(
                url=absolute_url,
                callback=self.save_pdf,
                meta={'title': full_title if full_title else 'No title'}
            )

    def save_pdf(self, response):
        """Write the downloaded PDF to ``Texty/<title>.pdf``.

        The title comes from ``response.meta['title']`` and is sanitized
        first, because it is scraped text and may contain path separators
        or other characters that are not valid in a file name.

        NOTE(review): two rows with the same title (e.g. several rows that
        fall back to 'No title') still map to the same file, so the later
        download overwrites the earlier one.

        Args:
            response: The response whose ``body`` holds the PDF bytes.
        """
        title = response.meta['title']
        filename = f"{self._safe_filename(title)}.pdf"
        download_dir = "Texty/"

        # exist_ok avoids the check-then-create race between callbacks.
        os.makedirs(download_dir, exist_ok=True)

        with open(os.path.join(download_dir, filename), 'wb') as f:
            f.write(response.body)
        self.log(f"Saved file {filename}")

    @classmethod
    def _safe_filename(cls, title):
        """Return *title* made safe for use as a single file-name component.

        Reserved characters and ASCII control characters are replaced with
        ``_``; leading/trailing spaces and dots are removed. Falls back to
        ``'No title'`` when nothing usable remains.
        """
        cleaned = "".join(
            "_" if ch in cls._UNSAFE_FILENAME_CHARS or ord(ch) < 32 else ch
            for ch in title
        ).strip(" .")
        return cleaned or "No title"