Skip to content
This repository was archived by the owner on Sep 19, 2025. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,37 +1,129 @@
annotated-types==0.6.0
asttokens==2.4.1
bcrypt==4.1.2
bertopic==0.16.0
black==24.2.0
blis==0.7.11
catalogue==2.0.10
certifi==2023.11.17
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cloudpathlib==0.16.0
comm==0.2.2
confection==0.1.4
contourpy==1.2.0
cryptography==42.0.5
cycler==0.12.1
cymem==2.0.8
Cython==0.29.37
debugpy==1.8.1
decorator==5.1.1
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
exceptiongroup==1.2.0
executing==2.0.1
filelock==3.13.4
fonttools==4.49.0
fsspec==2024.3.1
hdbscan==0.8.33
huggingface-hub==0.22.2
idna==3.6
iniconfig==2.0.0
ipykernel==6.29.3
ipython==8.22.2
jedi==0.19.1
Jinja2==3.1.3
joblib==1.3.2
Js2Py==0.74
jupyter_client==8.6.1
jupyter_core==5.7.2
kiwisolver==1.4.5
kunefe @ file:///home/olga/Documents/delftuni-mess/kunefe
langcodes==3.3.0
llvmlite==0.42.0
MarkupSafe==2.1.5
matplotlib==3.8.3
matplotlib-inline==0.1.6
mpmath==1.3.0
murmurhash==1.0.10
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.3
nltk==3.8.1
numba==0.59.1
numpy==1.26.4
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.19.3
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.1.105
packaging==23.2
pandas==2.2.1
paramiko==3.4.0
parso==0.8.3
pathspec==0.12.1
pexpect==4.9.0
pillow==10.2.0
platformdirs==4.2.0
plotly==5.20.0
pluggy==1.4.0
preshed==3.0.9
prompt-toolkit==3.0.43
psutil==5.9.8
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
pydantic==2.6.4
pydantic_core==2.16.3
Pygments==2.17.2
pyjsparser==2.7.1
PyNaCl==1.5.0
pynndescent==0.5.12
pyparsing==3.1.2
pytest==8.0.2
python-dateutil==2.9.0.post0
pytz==2024.1
PyYAML==6.0.1
pyzmq==25.1.2
regex==2023.12.25
requests==2.31.0
safetensors==0.4.2
scikit-learn==1.4.2
scipy==1.13.0
seaborn==0.13.2
sentence-transformers==2.6.1
six==1.16.0
smart-open==6.4.0
spacy==3.7.4
spacy-legacy==3.0.12
spacy-loggers==1.0.5
srsly==2.4.8
stack-data==0.6.3
sympy==1.12
tenacity==8.2.3
thinc==8.2.3
threadpoolctl==3.4.0
tokenizers==0.15.2
torch==2.2.2
tornado==6.4
tqdm==4.66.2
traitlets==5.14.2
transformers==4.39.3
triton==2.2.0
tube_dl==5.1.1
typer==0.9.4
typing_extensions==4.11.0
tzdata==2024.1
tzlocal==5.2
umap-learn==0.5.6
urllib3==2.1.0
wasabi==1.1.2
wcwidth==0.2.13
weasel==0.3.4
3 changes: 3 additions & 0 deletions title_topic_mod.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,6 @@ def display_topics(model, feature_names, no_top_words):

# Number of top words passed to display_topics (presumably the count of
# words shown per topic -- confirm against display_topics, defined earlier
# in this file).
no_top_words = 10
display_topics(model, title_freq_features, no_top_words)

# TRY BERTopic
# TRY spaCy
85 changes: 85 additions & 0 deletions tokanization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# This script is used to tokenize words in the title column

import string
import spacy
import pandas as pd

# Load the dataset; the rest of this script works on its "title" column.
df = pd.read_csv("porn-with-dates-2022.csv")

# Draw 20 titles at random (random.choices samples with replacement).
# The result is not printed; it is only echoed in an interactive session.
import random
random.choices(df.title, k=20)

# Keep the titles as a plain Python list.
titles = list(df.title)

# The small English pipeline must be installed first:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# let's try to detect videos with 'blowjob' in the title
def has_blowjob(text):
    """Return True if *text* contains a target word that is not tagged as a verb.

    The text is run through the module-level spaCy pipeline ``nlp``; each
    token's lower-cased form is compared against the target list, and a
    match only counts when the token's part-of-speech tag is not ``VERB``.
    """
    # NOTE(review): the comparison is per token, and a single spaCy token
    # normally contains no whitespace, so the two-word entry "blow job"
    # probably never matches -- confirm, and use phrase matching if needed.
    targets = ["blowjob", "blow job"]
    return any(
        token.lower_ in targets and token.pos_ != 'VERB'
        for token in nlp(text)
    )


import itertools

# Lazily scan the titles and take the first 10 matches.  islice simply stops
# early when fewer than 10 titles match, whereas repeated next() calls would
# raise StopIteration in that case.
g = (title for title in titles if has_blowjob(title))
list(itertools.islice(g, 10))

# Render one sample title with displaCy and look up what the "amod" label
# means.  Both results are only displayed in an interactive session.
from spacy import displacy
displacy.render(nlp("Angel Kisses Hot Cock | Gentle Blowjob and Mouth Full of Cum | Luxury Girl"))
spacy.explain("amod")





# TODO cleaning the data
def extract_words(text):
    """Extract words from a text and return them in a list.

    The text is split on whitespace, ASCII punctuation is stripped from both
    ends of every piece, and what remains is lower-cased.  Punctuation inside
    a word (e.g. the apostrophe in "don't") is kept.

    Pieces made up only of punctuation (such as a lone "|" separator) are
    dropped; indexing into the resulting empty string previously raised
    IndexError.

    Args:
        text: The string to split into words.

    Returns:
        A list of lower-cased words in their original order; empty if the
        text contains no words.
    """
    text_words = []

    for piece in text.split():  # split on any run of whitespace
        # Strip leading and trailing punctuation in one pass.
        word = piece.strip(string.punctuation)

        # Nothing is left when the piece was pure punctuation; skip it.
        if word:
            text_words.append(word.lower())

    return text_words


# apply to the whole title column
# df["title_words"] = df.title.apply(extract_words)
# TODO title[2] is breaking
title_words = extract_words(df.title[1])
print(title_words)

# Build a vocabulary for this single title: every distinct word gets the next
# free integer id, in order of first appearance.
word_dict = {}        # word -> id
word_list = []        # id -> word (the index is the id)
vocabulary_size = 0   # number of distinct words seen so far; also the next id
title_tokens = []     # the title expressed as a sequence of word ids

for word in title_words:
    # If we are seeing this word for the first time, create an id for it and
    # add it to our word dictionary
    if word not in word_dict:
        word_dict[word] = vocabulary_size
        word_list.append(word)
        vocabulary_size += 1

    # add the token corresponding to the current word to the tokenized text.
    title_tokens.append(word_dict[word])

print("Word list:", word_list, "\n\n Word dictionary:")
# The label above announces the dictionary, so print it as well.
print(word_dict)

print(title_tokens)