From 1d2f5325c96d7498229c1b1851a6645107f38574 Mon Sep 17 00:00:00 2001 From: Olga Lyashevska Date: Mon, 8 Apr 2024 18:34:29 +0200 Subject: [PATCH 1/2] Title tokanization --- requirements.txt | 33 +++++++++++++++++++++++++++++ tokanization.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tokanization.py diff --git a/requirements.txt b/requirements.txt index 87ef5f6..d1d746c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,37 +1,70 @@ +asttokens==2.4.1 +bcrypt==4.1.2 black==24.2.0 certifi==2023.11.17 +cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 +comm==0.2.2 contourpy==1.2.0 +cryptography==42.0.5 cycler==0.12.1 +debugpy==1.8.1 +decorator==5.1.1 exceptiongroup==1.2.0 +executing==2.0.1 fonttools==4.49.0 idna==3.6 iniconfig==2.0.0 +ipykernel==6.29.3 +ipython==8.22.2 +jedi==0.19.1 +Jinja2==3.1.3 joblib==1.3.2 Js2Py==0.74 +jupyter_client==8.6.1 +jupyter_core==5.7.2 kiwisolver==1.4.5 +kunefe @ file:///home/olga/Documents/delftuni-mess/kunefe +MarkupSafe==2.1.5 matplotlib==3.8.3 +matplotlib-inline==0.1.6 mypy-extensions==1.0.0 +nest-asyncio==1.6.0 nltk==3.8.1 numpy==1.26.4 packaging==23.2 pandas==2.2.1 +paramiko==3.4.0 +parso==0.8.3 pathspec==0.12.1 +pexpect==4.9.0 pillow==10.2.0 platformdirs==4.2.0 pluggy==1.4.0 +prompt-toolkit==3.0.43 +psutil==5.9.8 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pycparser==2.21 +Pygments==2.17.2 pyjsparser==2.7.1 +PyNaCl==1.5.0 pyparsing==3.1.2 pytest==8.0.2 python-dateutil==2.9.0.post0 pytz==2024.1 +pyzmq==25.1.2 regex==2023.12.25 requests==2.31.0 seaborn==0.13.2 six==1.16.0 +stack-data==0.6.3 +tornado==6.4 tqdm==4.66.2 +traitlets==5.14.2 tube_dl==5.1.1 tzdata==2024.1 tzlocal==5.2 urllib3==2.1.0 +wcwidth==0.2.13 diff --git a/tokanization.py b/tokanization.py new file mode 100644 index 0000000..0561dec --- /dev/null +++ b/tokanization.py @@ -0,0 +1,54 @@ +# Description: This script will take the data from the csv file and tokenize the data +import string + +import pandas 
as pd + +df = pd.read_csv("porn-with-dates-2022.csv") + + +# TODO gives error on df.title[2] +def extract_words(text): + temp = text.split() # Split the text on whitespace + text_words = [] + + for word in temp: + # Remove any punctuation characters present in the beginning of the word + while word[0] in string.punctuation: + word = word[1:] + + # Remove any punctuation characters present in the end of the word + while word[-1] in string.punctuation: + word = word[:-1] + + # Append this word into our list of words. + text_words.append(word.lower()) + + return text_words + + +# apply to the whole title column +# df.title_words = df.title.apply(extract_words) +# TODO title[2] is breaking +title_words = extract_words(df.title[1]) +print(title_words) + +word_dict = {} +word_list = [] +vocabulary_size = 0 +title_tokens = [] + +for word in title_words: + # If we are seeing this word for the first time, create an id for it and added it to our word dictionary + if word not in word_dict: + word_dict[word] = vocabulary_size + word_list.append(word) + vocabulary_size += 1 + + # add the token corresponding to the current word to the tokenized text. 
+ title_tokens.append(word_dict[word]) + +print("Word list:", word_list, "\n\n Word dictionary:") +word_dict + + +print(title_tokens) From 88a1b5ca4d87ca89ab4fa89e9896767fec14c33c Mon Sep 17 00:00:00 2001 From: Olga Lyashevska Date: Tue, 30 Apr 2024 15:08:16 +0200 Subject: [PATCH 2/2] Tokanization --- requirements.txt | 59 ++++++++++++++++++++++++++++++++++++++++++++++ title_topic_mod.py | 3 +++ tokanization.py | 41 ++++++++++++++++++++++++++++---- 3 files changed, 98 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index d1d746c..c1abdcc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,32 @@ +annotated-types==0.6.0 asttokens==2.4.1 bcrypt==4.1.2 +bertopic==0.16.0 black==24.2.0 +blis==0.7.11 +catalogue==2.0.10 certifi==2023.11.17 cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 +cloudpathlib==0.16.0 comm==0.2.2 +confection==0.1.4 contourpy==1.2.0 cryptography==42.0.5 cycler==0.12.1 +cymem==2.0.8 +Cython==0.29.37 debugpy==1.8.1 decorator==5.1.1 +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 exceptiongroup==1.2.0 executing==2.0.1 +filelock==3.13.4 fonttools==4.49.0 +fsspec==2024.3.1 +hdbscan==0.8.33 +huggingface-hub==0.22.2 idna==3.6 iniconfig==2.0.0 ipykernel==6.29.3 @@ -26,13 +39,31 @@ jupyter_client==8.6.1 jupyter_core==5.7.2 kiwisolver==1.4.5 kunefe @ file:///home/olga/Documents/delftuni-mess/kunefe +langcodes==3.3.0 +llvmlite==0.42.0 MarkupSafe==2.1.5 matplotlib==3.8.3 matplotlib-inline==0.1.6 +mpmath==1.3.0 +murmurhash==1.0.10 mypy-extensions==1.0.0 nest-asyncio==1.6.0 +networkx==3.3 nltk==3.8.1 +numba==0.59.1 numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 
+nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.19.3 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.1.105 packaging==23.2 pandas==2.2.1 paramiko==3.4.0 @@ -41,30 +72,58 @@ pathspec==0.12.1 pexpect==4.9.0 pillow==10.2.0 platformdirs==4.2.0 +plotly==5.20.0 pluggy==1.4.0 +preshed==3.0.9 prompt-toolkit==3.0.43 psutil==5.9.8 ptyprocess==0.7.0 pure-eval==0.2.2 pycparser==2.21 +pydantic==2.6.4 +pydantic_core==2.16.3 Pygments==2.17.2 pyjsparser==2.7.1 PyNaCl==1.5.0 +pynndescent==0.5.12 pyparsing==3.1.2 pytest==8.0.2 python-dateutil==2.9.0.post0 pytz==2024.1 +PyYAML==6.0.1 pyzmq==25.1.2 regex==2023.12.25 requests==2.31.0 +safetensors==0.4.2 +scikit-learn==1.4.2 +scipy==1.13.0 seaborn==0.13.2 +sentence-transformers==2.6.1 six==1.16.0 +smart-open==6.4.0 +spacy==3.7.4 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.4.8 stack-data==0.6.3 +sympy==1.12 +tenacity==8.2.3 +thinc==8.2.3 +threadpoolctl==3.4.0 +tokenizers==0.15.2 +torch==2.2.2 tornado==6.4 tqdm==4.66.2 traitlets==5.14.2 +transformers==4.39.3 +triton==2.2.0 tube_dl==5.1.1 +typer==0.9.4 +typing_extensions==4.11.0 tzdata==2024.1 tzlocal==5.2 +umap-learn==0.5.6 urllib3==2.1.0 +wasabi==1.1.2 wcwidth==0.2.13 +weasel==0.3.4 diff --git a/title_topic_mod.py b/title_topic_mod.py index d348294..486d2e2 100644 --- a/title_topic_mod.py +++ b/title_topic_mod.py @@ -80,3 +80,6 @@ def display_topics(model, feature_names, no_top_words): no_top_words = 10 display_topics(model, title_freq_features, no_top_words) + +# TRY BERTopic +# TRY Spacey diff --git a/tokanization.py b/tokanization.py index 0561dec..013659d 100644 --- a/tokanization.py +++ b/tokanization.py @@ -1,13 +1,46 @@ -# Description: This script will take the data from the csv file and tokenize the data -import string +# This script is used to tokenize words in the title column +import string +import spacy import pandas as pd df = pd.read_csv("porn-with-dates-2022.csv") +# print 20 random titles +import random 
+random.choices(df.title, k=20) + +# create a list of titles +titles = [_ for _ in df.title] + +# download model +# python -m spacy download en_core_web_sm +nlp = spacy.load("en_core_web_sm") + +# let's try to detect videos with 'blowjob' in the title +def has_blowjob(text): + doc = nlp(text) + for t in doc: + if t.lower_ in ["blowjob", "blow job"]: + if t.pos_!='VERB': + return True + return False + + +g = (title for title in titles if has_blowjob(title)) +[next(g) for i in range(10)] -# TODO gives error on df.title[2] +from spacy import displacy +displacy.render(nlp("Angel Kisses Hot Cock | Gentle Blowjob and Mouth Full of Cum | Luxury Girl")) +spacy.explain("amod") + + + + + +# TODO cleaning the data def extract_words(text): + """Extract words from a text and return them in a list.""" temp = text.split() # Split the text on whitespace text_words = [] @@ -48,7 +81,5 @@ def extract_words(text): title_tokens.append(word_dict[word]) print("Word list:", word_list, "\n\n Word dictionary:") -word_dict - print(title_tokens)