From 1d2f5325c96d7498229c1b1851a6645107f38574 Mon Sep 17 00:00:00 2001 From: Olga Lyashevska Date: Mon, 8 Apr 2024 18:34:29 +0200 Subject: [PATCH 1/2] Title tokanization --- requirements.txt | 33 +++++++++++++++++++++++++++++ tokanization.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tokanization.py diff --git a/requirements.txt b/requirements.txt index 87ef5f6..d1d746c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,37 +1,70 @@ +asttokens==2.4.1 +bcrypt==4.1.2 black==24.2.0 certifi==2023.11.17 +cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 +comm==0.2.2 contourpy==1.2.0 +cryptography==42.0.5 cycler==0.12.1 +debugpy==1.8.1 +decorator==5.1.1 exceptiongroup==1.2.0 +executing==2.0.1 fonttools==4.49.0 idna==3.6 iniconfig==2.0.0 +ipykernel==6.29.3 +ipython==8.22.2 +jedi==0.19.1 +Jinja2==3.1.3 joblib==1.3.2 Js2Py==0.74 +jupyter_client==8.6.1 +jupyter_core==5.7.2 kiwisolver==1.4.5 +kunefe @ file:///home/olga/Documents/delftuni-mess/kunefe +MarkupSafe==2.1.5 matplotlib==3.8.3 +matplotlib-inline==0.1.6 mypy-extensions==1.0.0 +nest-asyncio==1.6.0 nltk==3.8.1 numpy==1.26.4 packaging==23.2 pandas==2.2.1 +paramiko==3.4.0 +parso==0.8.3 pathspec==0.12.1 +pexpect==4.9.0 pillow==10.2.0 platformdirs==4.2.0 pluggy==1.4.0 +prompt-toolkit==3.0.43 +psutil==5.9.8 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pycparser==2.21 +Pygments==2.17.2 pyjsparser==2.7.1 +PyNaCl==1.5.0 pyparsing==3.1.2 pytest==8.0.2 python-dateutil==2.9.0.post0 pytz==2024.1 +pyzmq==25.1.2 regex==2023.12.25 requests==2.31.0 seaborn==0.13.2 six==1.16.0 +stack-data==0.6.3 +tornado==6.4 tqdm==4.66.2 +traitlets==5.14.2 tube_dl==5.1.1 tzdata==2024.1 tzlocal==5.2 urllib3==2.1.0 +wcwidth==0.2.13 diff --git a/tokanization.py b/tokanization.py new file mode 100644 index 0000000..0561dec --- /dev/null +++ b/tokanization.py @@ -0,0 +1,54 @@ +# Description: This script will take the data from the csv file and tokenize the data +import string + +import pandas 
as pd + +df = pd.read_csv("porn-with-dates-2022.csv") + + +# TODO gives error on df.title[2] +def extract_words(text): + temp = text.split() # Split the text on whitespace + text_words = [] + + for word in temp: + # Remove any punctuation characters present in the beginning of the word + while word[0] in string.punctuation: + word = word[1:] + + # Remove any punctuation characters present in the end of the word + while word[-1] in string.punctuation: + word = word[:-1] + + # Append this word into our list of words. + text_words.append(word.lower()) + + return text_words + + +# apply to the whole title column +# df.title_words = df.title.apply(extract_words) +# TODO title[2] is breaking +title_words = extract_words(df.title[1]) +print(title_words) + +word_dict = {} +word_list = [] +vocabulary_size = 0 +title_tokens = [] + +for word in title_words: + # If we are seeing this word for the first time, create an id for it and added it to our word dictionary + if word not in word_dict: + word_dict[word] = vocabulary_size + word_list.append(word) + vocabulary_size += 1 + + # add the token corresponding to the current word to the tokenized text. 
+ title_tokens.append(word_dict[word]) + +print("Word list:", word_list, "\n\n Word dictionary:") +word_dict + + +print(title_tokens) From 88a1b5ca4d87ca89ab4fa89e9896767fec14c33c Mon Sep 17 00:00:00 2001 From: Olga Lyashevska Date: Tue, 30 Apr 2024 15:08:16 +0200 Subject: [PATCH 2/2] Tokanization --- requirements.txt | 59 ++++++++++++++++++++++++++++++++++++++++++++++ title_topic_mod.py | 3 +++ tokanization.py | 41 ++++++++++++++++++++++++++++---- 3 files changed, 98 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index d1d746c..c1abdcc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,32 @@ +annotated-types==0.6.0 asttokens==2.4.1 bcrypt==4.1.2 +bertopic==0.16.0 black==24.2.0 +blis==0.7.11 +catalogue==2.0.10 certifi==2023.11.17 cffi==1.16.0 charset-normalizer==3.3.2 click==8.1.7 +cloudpathlib==0.16.0 comm==0.2.2 +confection==0.1.4 contourpy==1.2.0 cryptography==42.0.5 cycler==0.12.1 +cymem==2.0.8 +Cython==0.29.37 debugpy==1.8.1 decorator==5.1.1 +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 exceptiongroup==1.2.0 executing==2.0.1 +filelock==3.13.4 fonttools==4.49.0 +fsspec==2024.3.1 +hdbscan==0.8.33 +huggingface-hub==0.22.2 idna==3.6 iniconfig==2.0.0 ipykernel==6.29.3 @@ -26,13 +39,31 @@ jupyter_client==8.6.1 jupyter_core==5.7.2 kiwisolver==1.4.5 kunefe @ file:///home/olga/Documents/delftuni-mess/kunefe +langcodes==3.3.0 +llvmlite==0.42.0 MarkupSafe==2.1.5 matplotlib==3.8.3 matplotlib-inline==0.1.6 +mpmath==1.3.0 +murmurhash==1.0.10 mypy-extensions==1.0.0 nest-asyncio==1.6.0 +networkx==3.3 nltk==3.8.1 +numba==0.59.1 numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 
+nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.19.3 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.1.105 packaging==23.2 pandas==2.2.1 paramiko==3.4.0 @@ -41,30 +72,58 @@ pathspec==0.12.1 pexpect==4.9.0 pillow==10.2.0 platformdirs==4.2.0 +plotly==5.20.0 pluggy==1.4.0 +preshed==3.0.9 prompt-toolkit==3.0.43 psutil==5.9.8 ptyprocess==0.7.0 pure-eval==0.2.2 pycparser==2.21 +pydantic==2.6.4 +pydantic_core==2.16.3 Pygments==2.17.2 pyjsparser==2.7.1 PyNaCl==1.5.0 +pynndescent==0.5.12 pyparsing==3.1.2 pytest==8.0.2 python-dateutil==2.9.0.post0 pytz==2024.1 +PyYAML==6.0.1 pyzmq==25.1.2 regex==2023.12.25 requests==2.31.0 +safetensors==0.4.2 +scikit-learn==1.4.2 +scipy==1.13.0 seaborn==0.13.2 +sentence-transformers==2.6.1 six==1.16.0 +smart-open==6.4.0 +spacy==3.7.4 +spacy-legacy==3.0.12 +spacy-loggers==1.0.5 +srsly==2.4.8 stack-data==0.6.3 +sympy==1.12 +tenacity==8.2.3 +thinc==8.2.3 +threadpoolctl==3.4.0 +tokenizers==0.15.2 +torch==2.2.2 tornado==6.4 tqdm==4.66.2 traitlets==5.14.2 +transformers==4.39.3 +triton==2.2.0 tube_dl==5.1.1 +typer==0.9.4 +typing_extensions==4.11.0 tzdata==2024.1 tzlocal==5.2 +umap-learn==0.5.6 urllib3==2.1.0 +wasabi==1.1.2 wcwidth==0.2.13 +weasel==0.3.4 diff --git a/title_topic_mod.py b/title_topic_mod.py index d348294..486d2e2 100644 --- a/title_topic_mod.py +++ b/title_topic_mod.py @@ -80,3 +80,6 @@ def display_topics(model, feature_names, no_top_words): no_top_words = 10 display_topics(model, title_freq_features, no_top_words) + +# TRY BERTopic +# TRY Spacey diff --git a/tokanization.py b/tokanization.py index 0561dec..013659d 100644 --- a/tokanization.py +++ b/tokanization.py @@ -1,13 +1,46 @@ -# Description: This script will take the data from the csv file and tokenize the data -import string +# This script is used to tokenize words in the title column +import string +import spacy import pandas as pd df = pd.read_csv("porn-with-dates-2022.csv") +# print 20 random titles +import random 
+random.choices(df.title, k=20) + +# create a list of titles +titles = [_ for _ in df.title] + +# download model +# python -m spacy download en_core_web_sm +nlp = spacy.load("en_core_web_sm") + +# let's try to detect videos with 'blowjob' in the title +def has_blowjob(text): + doc = nlp(text) + for t in doc: + if t.lower_ in ["blowjob", "blow job"]: + if t.pos_!='VERB': + return True + return False + + +g = (title for title in titles if has_blowjob(title)) +[next(g) for i in range(10)] -# TODO gives error on df.title[2] +from spacy import displacy +displacy.render(nlp("Angel Kisses Hot Cock | Gentle Blowjob and Mouth Full of Cum | Luxury Girl")) +spacy.explain("amod") + + + + + +# TODO cleaning the data def extract_words(text): + """Extract words from a text and return them in a list.""" temp = text.split() # Split the text on whitespace text_words = [] @@ -48,7 +81,5 @@ def extract_words(text): title_tokens.append(word_dict[word]) print("Word list:", word_list, "\n\n Word dictionary:") -word_dict - print(title_tokens)