DSA_Case_Study/driver_code.py at main · jimitnick/DSA_Case_Study · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
from malicious_ip_filter import MaliciousIPFilter
from malicious_url_filter import MaliciousURLFilter
from search_indexer import run_crawler_simulation
from helper_functions import load_malicious_urls_from_csv, load_ips_from_text_file
import re
import ipaddress

# Compiled once at import time. It is applied with fullmatch(), so the whole
# string must be the URL (a "$" anchor would tolerate a trailing newline).
_URL_PATTERN = re.compile(r'(https?://)[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(/.*)?')


def is_valid_url(url):
    """
    Return True if `url` has the shape ``http(s)://host.tld[/anything]``.

    This is a deliberately simple syntactic check:
      - the scheme must be lowercase ``http://`` or ``https://``
      - the host may contain only ASCII letters, digits, dots and hyphens and
        must end in a dot followed by at least two letters
      - anything after the host must start with ``/``

    As a consequence, URLs with a port, an IP-literal host, or a query string
    directly after the host are rejected. Non-string input returns False.
    """
    if not isinstance(url, str):
        return False
    return _URL_PATTERN.fullmatch(url) is not None

def is_valid_ip(ip):
    """
    Return True if `ip` is accepted by ``ipaddress.ip_address`` (IPv4 or
    IPv6), and False if that call raises ValueError.
    """
    try:
        ipaddress.ip_address(ip)
    except ValueError:
        return False
    else:
        return True

# --- Added imports for validation & canonicalization ---
import re
import ipaddress
from urllib.parse import urlparse, urlunparse, unquote


def canonicalize_url(raw_url: str) -> str:
    """
    Canonicalize an http(s) URL so that equivalent spellings compare equal.

    Steps applied:
      - Require an explicit http/https scheme and a network location
      - Reject URLs that embed credentials (user:pass@host)
      - Lowercase the scheme and the host
      - Drop an empty port ("host:") and the scheme's default port
        (80 for http, 443 for https); any other numeric port is kept
      - Percent-decode the path and the query
      - Drop path parameters (";params") and the fragment
      - Use "/" when the path is empty

    Returns the canonicalized URL string on success, or an empty string when
    the input is not a string, cannot be parsed, or fails a check above.
    """
    if not isinstance(raw_url, str):
        return ""

    try:
        parsed = urlparse(raw_url)
    except ValueError:
        # urlparse raises ValueError for malformed network locations,
        # e.g. an unbalanced IPv6 bracket.
        return ""

    scheme = parsed.scheme.lower()
    netloc = parsed.netloc

    # Only http/https URLs with a network location are canonicalized;
    # javascript:, data:, ftp:, etc. are treated as failures.
    if scheme not in ("http", "https") or not netloc:
        return ""

    # Reject credentials embedded in the URL (user:pass@host).
    if "@" in netloc:
        return ""

    # Split an optional ":port" suffix off the host. A bracketed IPv6 literal
    # without a port ends in "]", so its last ":"-separated piece is never
    # numeric and the whole netloc stays the host.
    host, port = netloc, None
    host_part, colon, port_part = netloc.rpartition(":")
    if colon:
        if not port_part:
            # "host:" - an empty port means "use the default"; drop the colon.
            host = host_part
        elif port_part.isascii() and port_part.isdigit():
            # isdigit() alone also accepts characters such as superscript
            # digits, which int() rejects with ValueError.
            host, port = host_part, int(port_part)

    host = host.lower()
    if not host:
        return ""

    default_ports = {"http": 80, "https": 443}
    # Compare against None, not truthiness, so an explicit port 0 is not
    # silently treated as "no port".
    if port is not None and port != default_ports[scheme]:
        netloc = f"{host}:{port}"
    else:
        netloc = host

    # NOTE(review): unquote() also decodes reserved escapes such as %3F or
    # %23, so the result can re-parse differently from the input. Kept as-is
    # to preserve existing matching behavior.
    path = unquote(parsed.path) or "/"
    query = unquote(parsed.query)

    # Params and fragment are intentionally emitted empty.
    return urlunparse((scheme, netloc, path, "", query, ""))


def is_valid_url_format(raw_url: str) -> bool:
    """
    Lightweight syntactic checks for a URL before canonicalization.

    Returns True only when, after stripping surrounding whitespace, the URL:
      - is a non-empty string
      - does not start with javascript:, data: or vbscript:
      - contains "://"
      - contains none of the suspicious substrings "@", "base64", ".exe",
        ".zip" or ".js" (case-insensitive)
      - parses with an http/https scheme and a non-empty network location

    Unparseable URLs return False instead of raising.
    """
    if not isinstance(raw_url, str):
        return False

    candidate = raw_url.strip()
    if not candidate:
        return False

    # Quick reject of clearly malicious schemes.
    if re.search(r"^\s*(javascript:|data:|vbscript:)", candidate, re.IGNORECASE):
        return False

    # Tiny sanity check: a scheme-based URL must contain "://".
    if "://" not in candidate:
        return False

    # Conservative on purpose: these are regex fragments searched anywhere in
    # the URL, so ".js" also rejects e.g. ".json" and ".jsp". Adjust the
    # list if fewer false positives are wanted.
    suspicious_substrings = ("@", "base64", r"\.exe", r"\.zip", r"\.js")
    if any(re.search(s, candidate, re.IGNORECASE) for s in suspicious_substrings):
        return False

    try:
        parsed = urlparse(candidate)
    except ValueError:
        # e.g. an unbalanced "[" in the host makes urlparse raise.
        return False

    if not parsed.netloc:
        return False

    return parsed.scheme.lower() in ("http", "https")


def is_valid_ip_format(ip_str: str) -> bool:
    """
    Check whether `ip_str` is a valid IPv4 or IPv6 address.

    Delegates to ``ipaddress.ip_address``; a ValueError from that call means
    the address is invalid and False is returned.
    """
    is_valid = True
    try:
        ipaddress.ip_address(ip_str)
    except ValueError:
        is_valid = False
    return is_valid


def _run_lookup_loop(prompt, is_valid, blacklist, invalid_message):
    """
    Repeatedly prompt for a value and report whether it is in `blacklist`.

    Each entry is stripped and lowercased before use. Blank input is ignored,
    input rejected by `is_valid` prints `invalid_message`, and the loop ends
    when the user types 'exit' or stdin is closed.
    """
    while True:
        try:
            user_input = input(prompt).strip().lower()
        except EOFError:
            # stdin was closed: leave the sub-menu instead of crashing.
            print()
            return

        if user_input == 'exit':
            return
        if not user_input:
            continue
        if not is_valid(user_input):
            print(invalid_message)
            continue

        if user_input in blacklist:
            print(f"Warning: '{user_input}' is probably malicious.")
        else:
            print(f"'{user_input}' is not in the blacklist.")


def run_interactive_checker():
    """
    Main function to run the interactive malicious URL/IP checker.

    Loads the URL and IP blacklists from the dataset files, adds every entry
    (stripped and lowercased) to the corresponding filter, then shows a menu
    until the user chooses to exit or stdin is closed:
      1. look up URLs in the URL filter
      2. look up IPs in the IP filter
      3. call run_crawler_simulation()
      4. exit

    If either dataset loads as empty, an error is printed and the function
    returns without showing the menu.
    """
    # The filenames should match where the datasets were saved.
    url_dataset_filename = 'datasets/malicious_urls.csv'
    ip_dataset_filename = 'datasets/bad_ip_dataset.txt'

    known_malicious_urls = load_malicious_urls_from_csv(url_dataset_filename)
    known_malicious_ips = load_ips_from_text_file(ip_dataset_filename)

    if not known_malicious_urls or not known_malicious_ips:
        print("Error: Could not load malicious datasets. Please verify dataset paths.")
        return

    # Acceptable probability of a "false alarm" when querying the filters.
    desired_fp_probability = 0.01

    url_filter = MaliciousURLFilter(len(known_malicious_urls), desired_fp_probability)
    # NOTE(review): the URL filter is sized with an item count, but the IP
    # filter receives the list itself - confirm this matches the
    # MaliciousIPFilter constructor.
    ip_filter = MaliciousIPFilter(known_malicious_ips, desired_fp_probability)

    # Entries are normalized the same way as user input (strip + lowercase)
    # so that lookups compare like with like.
    for url in known_malicious_urls:
        url_filter.add(url.strip().lower())
    for ip in known_malicious_ips:
        ip_filter.add(ip.strip().lower())

    while True:
        print("\n==============================")
        print(" Malicious Checker Menu ")
        print("==============================")
        print("1. Check for malicious URL")
        print("2. Check for malicious IP")
        print("3. Check for website")
        print("4. Exit")

        try:
            choice = int(input("Enter your choice: ").strip())
        except ValueError:
            print("Invalid input. Please enter a number between 1 and 4.")
            continue
        except EOFError:
            # stdin was closed: exit cleanly instead of raising.
            print("\nExiting program.")
            break

        if choice == 1:
            print("\n--- Malicious URL Checker ---")
            _run_lookup_loop(
                "Enter a URL (or type 'exit' to go back): ",
                is_valid_url,
                url_filter,
                "The URL should be in the format: https://example.com",
            )

        elif choice == 2:
            print("\n--- Malicious IP Checker ---")
            _run_lookup_loop(
                "Enter an IP (or type 'exit' to go back): ",
                is_valid_ip,
                ip_filter,
                "Invalid IP format. Please enter a valid IPv4 or IPv6 address.",
            )

        elif choice == 3:
            run_crawler_simulation()

        elif choice == 4:
            print("Exiting program.")
            break

        else:
            print("Invalid choice. Please select between 1 and 4.")

# Start the interactive menu only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_interactive_checker()