-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
59 lines (45 loc) · 2.13 KB
/
main.py
File metadata and controls
59 lines (45 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from app.scraper import scrape_with_requests, scrape_with_selenium
from app.utils import load_selectors, check_url, extract_article_data, save_to_csv, extract_article_links
from bs4 import BeautifulSoup
import argparse, os
from datetime import datetime
def _read_url_list(path):
    """Return the non-blank lines of the text file at *path*, stripped.

    Blank and whitespace-only lines are dropped so that they are never
    handed to the scraper as empty URLs.
    """
    with open(path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [url for url in stripped if url]


def main():
    """Command-line entry point: scrape article(s) and save them to a CSV.

    Command-line arguments:
        url          Optional positional URL to process.
        -a           Treat every input URL as a single article.
        -b           Treat every input URL as a blog page; each one is passed
                     to ``extract_article_links`` and the results are
                     collected as the article links to scrape.
        --list       Path to a text file with one URL per line. Takes
                     precedence over the positional ``url``.
        --selectors  Name of a ``app/selectors/<name>.json`` file. When the
                     name is missing or unknown, ``app/selectors/selectors.json``
                     is used instead.

    If both ``-a`` and ``-b`` are given, ``-a`` wins.

    Every article link is passed to ``extract_article_data`` and the collected
    results are handed to ``save_to_csv`` under a timestamped file name.

    Raises:
        SystemExit: via ``parser.error`` (exit status 2) when no URL source is
            given, no site type is given, the list file contains no URLs, or
            ``load_selectors`` returns a falsy value.
    """
    import sys  # local import: only needed for the stderr warning below

    parser = argparse.ArgumentParser(description="Asphalt scraper.")
    parser.add_argument("url", nargs="?", help="The URL to process")
    parser.add_argument("-a", action="store_true", help="If scraping for a single (a)rticle")
    parser.add_argument("-b", action="store_true", help="If scraping for a (b)log")
    parser.add_argument("--list", type=str, help="Path to txt file with URLs")
    parser.add_argument("--selectors", type=str, help="Selectors identifier (e.g. ghost, medium, etc.)")
    args = parser.parse_args()

    # Validate everything that depends only on the command line first, so a
    # bad invocation produces a usage error before any file-system access.
    if not (args.list or args.url):
        parser.error("No URL or --list provided.")

    site_type = 'a' if args.a else 'b' if args.b else None
    if not site_type:
        parser.error("No site type provided. Use -a for a single article or -b for a blog page.")

    # Get URLs
    urls = _read_url_list(args.list) if args.list else [args.url]
    if not urls:
        parser.error(f"No URLs found in {args.list}.")

    # Load selectors
    selectors_url = 'app/selectors/selectors.json'
    if args.selectors:
        # Only real *.json files count as selectable; splitext strips just the
        # final extension instead of every ".json" occurrence in the name.
        available = {
            os.path.splitext(entry)[0]
            for entry in os.listdir('app/selectors')
            if entry.endswith('.json')
        }
        if args.selectors in available:
            selectors_url = f'app/selectors/{args.selectors}.json'
        else:
            # Keep the original fallback, but make it visible to the user.
            print(
                f"Warning: selectors '{args.selectors}' not found; "
                f"falling back to {selectors_url}.",
                file=sys.stderr,
            )

    selectors = load_selectors(selectors_url)
    if not selectors:
        parser.error("No selectors provided. Check the selectors file.")

    # Blog pages are expanded into their article links; article URLs are
    # used as given.
    if site_type == 'b':
        articles_links = []
        for url in urls:
            articles_links.extend(extract_article_links(url, selectors))
    else:
        articles_links = urls

    articles = [extract_article_data(link, selectors) for link in articles_links]

    filename = f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.csv"
    print(f"Found {len(articles)} articles. Saving to {filename}...")
    save_to_csv(articles, filename)
# Run the scraper only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()