-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_prothsearch.py
More file actions
62 lines (57 loc) · 1.93 KB
/
get_prothsearch.py
File metadata and controls
62 lines (57 loc) · 1.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re

import requests
from bs4 import BeautifulSoup
import pandas as pd
# Lista med URL:er
urls = [
"http://www.prothsearch.com/riesel1c.html",
"http://www.prothsearch.com/riesel1b.html",
"http://www.prothsearch.com/riesel1a.html",
"http://www.prothsearch.com/riesel1.html"
]
# Funktion för att extrahera Proth-primtal från en URL
def extract_proth_primes(url):
import re
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
data = []
font_tags = soup.find_all('font', attrs={'color': 'Blue'})
k = None
for idx, tag in enumerate(font_tags):
k_tag = tag.find('b')
if k_tag and k_tag.text.strip().isdigit():
k = int(k_tag.text.strip())
else:
continue
# Collect n values from the next siblings until the next <font color="Blue">
current = tag.next_sibling
while current:
if getattr(current, 'name', None) == 'font' and current.get('color') == 'Blue':
break
if hasattr(current, 'get_text'):
text = current.get_text()
else:
text = str(current)
if '[' in text:
text = text.split('[')[0]
nums = re.findall(r'\d+', text)
for n in nums:
try:
n_int = int(n)
data.append({'k': k, 'n': n_int})
except Exception:
continue
current = current.next_sibling
return data
# Samla all data
all_data = []
for url in urls:
print(f"Hämtar data från: {url}")
all_data.extend(extract_proth_primes(url))
# Konvertera till DataFrame och spara
df = pd.DataFrame(all_data)
if not df.empty:
df = df[['k', 'n']]
df.to_csv("prothsearch_primtal.csv", index=False)
print(f"✅ Sparat {len(df)} rader till 'prothsearch_primtal.csv'")
else:
print("❌ Inga Proth-primaler hittades och sparades.")