-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathdriver_code.py
More file actions
237 lines (192 loc) · 7.83 KB
/
driver_code.py
File metadata and controls
237 lines (192 loc) · 7.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
from malicious_ip_filter import MaliciousIPFilter
from malicious_url_filter import MaliciousURLFilter
from search_indexer import run_crawler_simulation
from helper_functions import load_malicious_urls_from_csv, load_ips_from_text_file
import re
import ipaddress
# Compiled once at import time. Shape accepted: an http/https scheme, a dotted
# hostname made of letters, digits, '.' and '-', ending in an alphabetic label
# of at least two letters, then an optional path that starts with '/'.
_URL_PATTERN = re.compile(r'(https?://)[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(/.*)?')


def is_valid_url(url):
    """Return True if *url* looks like ``http(s)://host.tld[/path]``.

    This is a purely syntactic check. The pattern does not accept ports,
    embedded credentials, IP-literal hosts, single-label hosts such as
    ``localhost``, or a query string that is not preceded by a ``/`` path.

    Args:
        url: Candidate URL. Anything that is not a ``str`` is rejected.

    Returns:
        bool: True when the whole string matches the pattern, else False.
    """
    if not isinstance(url, str):
        return False
    # fullmatch() instead of match() with '$': '$' also matches just before a
    # trailing newline, which would let "https://example.com\n" through.
    return _URL_PATTERN.fullmatch(url) is not None
def is_valid_ip(ip):
    """Return True if *ip* is a string holding a valid IPv4 or IPv6 address.

    Args:
        ip: Candidate address in textual form (e.g. ``"10.0.0.1"``, ``"::1"``).

    Returns:
        bool: True when ``ipaddress.ip_address`` accepts the string, else False.
    """
    # ipaddress.ip_address() also accepts integers and packed bytes, so
    # without this guard a value such as 1 would be reported as a valid
    # address. Only textual addresses are meaningful to this validator.
    if not isinstance(ip, str):
        return False
    try:
        ipaddress.ip_address(ip)
    except ValueError:
        return False
    return True
# --- Added imports for validation & canonicalization ---
import re
import ipaddress
from urllib.parse import urlparse, urlunparse, unquote
def canonicalize_url(raw_url: str) -> str:
    """Return a canonical form of an http(s) URL for consistent matching.

    Canonicalization steps:
      - surrounding whitespace is stripped;
      - an explicit ``http`` or ``https`` scheme and a host are required;
      - URLs carrying credentials (``user:pass@host``) are rejected;
      - scheme and hostname are lower-cased;
      - the scheme's default port (80 for http, 443 for https) is removed,
        any other explicit port is kept;
      - path and query are percent-decoded;
      - an empty path becomes ``/`` and the fragment is dropped.

    Args:
        raw_url: The URL to canonicalize.

    Returns:
        The canonical URL, or an empty string when the input is not a string,
        cannot be parsed, has no host, has a non-numeric or out-of-range port,
        uses a scheme other than http/https, or embeds credentials.
    """
    if not isinstance(raw_url, str):
        return ""

    try:
        parsed = urlparse(raw_url.strip())
        # .hostname lower-cases the host and strips IPv6 brackets; .port
        # raises ValueError for a non-numeric or out-of-range port. Both are
        # read inside the try so a malformed authority yields "" instead of
        # a garbage netloc or an uncaught exception.
        host = parsed.hostname
        port = parsed.port
    except ValueError:
        return ""

    scheme = parsed.scheme.lower()
    if scheme not in ("http", "https"):
        # javascript:, data:, ftp:, etc. are not canonicalized here.
        return ""

    # Reject credentials embedded in the URL (user:pass@host).
    if "@" in parsed.netloc:
        return ""

    # Covers both a missing netloc and a netloc with an empty host (":80").
    if not host:
        return ""

    # .hostname removed the brackets of an IPv6 literal; restore them so the
    # rebuilt netloc stays unambiguous when a port follows.
    if ":" in host:
        host = f"[{host}]"

    default_ports = {"http": 80, "https": 443}
    # Compare against None rather than truthiness so an explicit port 0 is
    # not silently treated as "no port".
    if port is not None and port != default_ports[scheme]:
        netloc = f"{host}:{port}"
    else:
        netloc = host

    # Percent-decode path and query for canonical comparison.
    path = unquote(parsed.path or "")
    query = unquote(parsed.query or "")

    # Params and fragment are intentionally left empty.
    return urlunparse((scheme, netloc, path or "/", "", query, ""))
# Schemes that are rejected outright when they start the URL.
_SCRIPT_SCHEME_PATTERN = re.compile(r"(javascript:|data:|vbscript:)", re.IGNORECASE)

# Conservative deny-list of substrings, matched anywhere in the URL.
# NOTE(review): `\.js` also matches ".json" and ".jsp", and "@" also matches
# e-mail addresses inside a query string; loosen if false positives matter.
_SUSPICIOUS_URL_PATTERN = re.compile(r"@|base64|\.exe|\.zip|\.js", re.IGNORECASE)


def is_valid_url_format(raw_url: str) -> bool:
    """Run lightweight syntactic checks on a URL before canonicalization.

    A URL passes when, after stripping surrounding whitespace, it:
      - is not empty;
      - does not start with ``javascript:``, ``data:`` or ``vbscript:``;
      - contains ``://``;
      - contains none of the deny-listed substrings (``@``, ``base64``,
        ``.exe``, ``.zip``, ``.js``), compared case-insensitively;
      - parses with a non-empty network location and an http/https scheme.

    Args:
        raw_url: The URL to inspect. Non-string values are rejected.

    Returns:
        bool: True if every check passes, False otherwise.
    """
    if not isinstance(raw_url, str):
        return False

    candidate = raw_url.strip()
    if not candidate:
        return False

    # Quick reject of script/data schemes at the start of the URL.
    if _SCRIPT_SCHEME_PATTERN.match(candidate):
        return False

    # Must contain :// to be a scheme-based URL.
    if "://" not in candidate:
        return False

    if _SUSPICIOUS_URL_PATTERN.search(candidate):
        return False

    try:
        parsed = urlparse(candidate)
    except ValueError:
        # urlparse raises for malformed authorities such as unbalanced IPv6
        # brackets; a validator should answer False rather than propagate.
        return False

    if not parsed.scheme or not parsed.netloc:
        return False
    return parsed.scheme.lower() in ("http", "https")
def is_valid_ip_format(ip_str: str) -> bool:
    """Validate a textual IPv4 or IPv6 address using the ``ipaddress`` module.

    Args:
        ip_str: The address to validate, as a string.

    Returns:
        bool: True if *ip_str* is a string that ``ipaddress.ip_address``
        accepts; False for malformed addresses and for non-string values.
    """
    # The parameter is documented as a string, but ipaddress.ip_address()
    # would also accept integers and packed bytes and report them as valid.
    if not isinstance(ip_str, str):
        return False
    try:
        # ip_address raises ValueError for invalid addresses.
        ipaddress.ip_address(ip_str)
    except ValueError:
        return False
    else:
        return True
def _run_lookup_prompt(banner, prompt, is_valid, invalid_message, blacklist):
    """Repeatedly read a value, validate it and report blacklist membership.

    The loop ends when the user types ``exit``. Blank input is ignored and
    input rejected by *is_valid* prints *invalid_message* and re-prompts.
    Input is stripped and lower-cased before validation and lookup.
    """
    print(banner)
    while True:
        user_input = input(prompt).strip().lower()
        if user_input == 'exit':
            return
        if not user_input:
            continue
        if not is_valid(user_input):
            print(invalid_message)
            continue
        if user_input in blacklist:
            print(f"Warning: '{user_input}' is probably malicious.")
        else:
            print(f"'{user_input}' is not in the blacklist.")


def _run_menu_loop(url_filter, ip_filter):
    """Show the main menu and dispatch choices until the user selects Exit."""
    while True:
        print("\n==============================")
        print(" Malicious Checker Menu ")
        print("==============================")
        print("1. Check for malicious URL")
        print("2. Check for malicious IP")
        print("3. Check for website")
        print("4. Exit")
        try:
            choice = int(input("Enter your choice: ").strip())
        except ValueError:
            print("Invalid input. Please enter a number between 1 and 4.")
            continue

        if choice == 1:
            _run_lookup_prompt(
                "\n--- Malicious URL Checker ---",
                "Enter a URL (or type 'exit' to go back): ",
                is_valid_url,
                "The URL should be in the format: https://example.com",
                url_filter,
            )
        elif choice == 2:
            _run_lookup_prompt(
                "\n--- Malicious IP Checker ---",
                "Enter an IP (or type 'exit' to go back): ",
                is_valid_ip,
                "Invalid IP format. Please enter a valid IPv4 or IPv6 address.",
                ip_filter,
            )
        elif choice == 3:
            run_crawler_simulation()
        elif choice == 4:
            print("Exiting program.")
            return
        else:
            print("Invalid choice. Please select between 1 and 4.")


def run_interactive_checker():
    """Run the interactive malicious URL / IP checker on the bundled datasets.

    Loads the URL and IP blacklists from the ``datasets/`` directory, adds
    every entry (stripped and lower-cased) to the corresponding filter, then
    serves a text menu offering a URL lookup, an IP lookup, the crawler
    simulation, and exit. If either dataset comes back empty, an error is
    printed and the function returns without showing the menu. A closed
    input stream (EOF) ends the session with a message instead of a traceback.
    """
    # Step 1: load the data. The filenames must match the saved datasets.
    URL_DATASET_FILENAME = 'datasets/malicious_urls.csv'
    IP_DATASET_FILENAME = 'datasets/bad_ip_dataset.txt'

    known_malicious_urls = load_malicious_urls_from_csv(URL_DATASET_FILENAME)
    known_malicious_ips = load_ips_from_text_file(IP_DATASET_FILENAME)

    if not known_malicious_urls or not known_malicious_ips:
        print("Error: Could not load malicious datasets. Please verify dataset paths.")
        return

    EXPECTED_URLS_IN_BLACKLIST = len(known_malicious_urls)
    EXPECTED_IPS_IN_BLACKLIST = len(known_malicious_ips)

    # Acceptable probability of a false "probably malicious" answer.
    DESIRED_FP_PROBABILITY = 0.01

    url_filter = MaliciousURLFilter(EXPECTED_URLS_IN_BLACKLIST, DESIRED_FP_PROBABILITY)
    # NOTE(review): the URL filter is sized with an item count, while the IP
    # filter receives the list itself and EXPECTED_IPS_IN_BLACKLIST is
    # otherwise unused. Confirm against MaliciousIPFilter's constructor
    # whether the count was intended here.
    ip_filter = MaliciousIPFilter(known_malicious_ips, DESIRED_FP_PROBABILITY)

    # Entries are normalized the same way as user input (strip + lower) so
    # that membership tests compare like with like.
    for url in known_malicious_urls:
        url_filter.add(url.strip().lower())

    for ip in known_malicious_ips:
        ip_filter.add(ip.strip().lower())

    try:
        _run_menu_loop(url_filter, ip_filter)
    except EOFError:
        # input() raises EOFError when stdin is closed (Ctrl-D, exhausted
        # pipe); leave cleanly rather than crash.
        print("\nInput stream closed. Exiting program.")
# Script entry point: start the interactive checker only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    run_interactive_checker()