forked from ryyhan/PhishingDomainDetection
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
23 lines (18 loc) · 676 Bytes
/
preprocess.py
File metadata and controls
23 lines (18 loc) · 676 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import pickle
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
# Shared preprocessing objects: tokenization, stemming, vectorization.

# Keeps only runs of ASCII letters; digits and punctuation act as separators.
tokenizer = RegexpTokenizer(r"[A-Za-z]+")

# Snowball stemmer configured for English.
stemmer = SnowballStemmer("english")

# Fresh, unfitted vectorizer; nothing in the visible code uses it after this.
cv = CountVectorizer()

# Restore the previously saved vectorizer from disk. The path is relative to
# the current working directory.
# NOTE: unpickling can execute arbitrary code, so "vectorizer.pkl" must come
# from a trusted source.
with open("vectorizer.pkl", mode="rb") as file:
    cvectorizer = pickle.load(file)
def prepare_data(text):
    """Turn a raw string into a feature row for the loaded vectorizer.

    The input is split into alphabetic tokens, every token is reduced to its
    English Snowball stem, and the stems are joined with single spaces. The
    resulting string is then handed to the module-level ``cvectorizer``.

    Args:
        text: The string to featurize.

    Returns:
        Whatever ``cvectorizer.transform`` returns for a one-element list
        containing the stemmed text (presumably a 1-row document-term
        matrix, if the pickled object is a fitted CountVectorizer).
    """
    stems = (stemmer.stem(token) for token in tokenizer.tokenize(text))
    stemmed_text = " ".join(stems)

    # Vectorize the stemmed string rather than the raw input. Previously the
    # stemmed text was computed and then ignored, so the tokenize/stem steps
    # had no effect on the output.
    # NOTE(review): assumes the pickled vectorizer was fitted on text
    # preprocessed the same way - confirm against the training script.
    return cvectorizer.transform([stemmed_text])