diff --git a/pornhub_lineplot_over_years.py b/pornhub_lineplot_over_years.py
new file mode 100644
index 0000000..8558a0a
--- /dev/null
+++ b/pornhub_lineplot_over_years.py
@@ -0,0 +1,84 @@
+"""Show how the latest year's views are distributed among its top 10 tags.
+
+Reads 'porn-with-dates-2022.csv', sums the views of every video carrying each
+tag in the most recent year of the 'date' column, prints the 10 most-viewed
+tags and plots their share of that year's views next to an 'Other' bar.
+"""
+import ast
+from collections import Counter
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Read the CSV file
+df = pd.read_csv('porn-with-dates-2022.csv')
+
+# Convert the 'date' column to datetime
+df['date'] = pd.to_datetime(df['date'])
+
+# Determine the latest year in the dataset
+latest_year = int(df['date'].dt.year.max())
+
+# Filter data for the latest year (copied so a helper column can be added)
+df_latest_year = df[df['date'].dt.year == latest_year].copy()
+
+# Parse each stringified tag list exactly once. ast.literal_eval accepts only
+# Python literals, so a malformed or malicious CSV cell cannot execute code
+# the way eval() would.
+df_latest_year['tag_list'] = df_latest_year['categories'].apply(ast.literal_eval)
+
+# Get the total number of views for the latest year
+total_views_latest_year = df_latest_year['views'].sum()
+
+# Sum the views of every video carrying each tag
+tag_views_latest_year = {}
+for tags, views in zip(df_latest_year['tag_list'], df_latest_year['views']):
+    for tag in tags:
+        tag_views_latest_year[tag] = tag_views_latest_year.get(tag, 0) + views
+
+# Raw view totals per tag, sorted in descending order
+popularity_latest_year_raw = pd.Series(tag_views_latest_year).sort_values(ascending=False)
+
+# Share (in percent) of the latest year's total views for each tag
+popularity_latest_year = (popularity_latest_year_raw / total_views_latest_year) * 100
+
+# Get the top 10 tags for the latest year
+top_tags_latest_year = popularity_latest_year.nlargest(10)
+top_tags_latest_year_raw = popularity_latest_year_raw.nlargest(10)
+
+# Print distribution of views among the top tags in the latest year
+print(f"Top 10 tags in {latest_year}:")
+top_tags = []
+top_views = []
+
+# Iterate over (tag, views) pairs in the top tags Series
+for tag, views in top_tags_latest_year_raw.items():
+    percentage = (views / total_views_latest_year) * 100
+    print(f"{tag}: {views} views ({percentage:.2f}% of total)")
+    top_tags.append(tag)
+    top_views.append(percentage)
+
+# Collect the URLs of videos carrying at least one top-10 tag. Membership is
+# tested on the parsed list because str.contains() would treat the tag as a
+# regular expression and would also match it inside longer tag names.
+top_tag_set = set(top_tags_latest_year_raw.index)
+has_top_tag = df_latest_year['tag_list'].apply(lambda tags: not top_tag_set.isdisjoint(tags))
+videos_with_top_tags = set(df_latest_year.loc[has_top_tag, 'url'])
+
+# Calculate the total views for 'other' tags
+other_views = df_latest_year[~df_latest_year['url'].isin(videos_with_top_tags)]['views'].sum()
+
+top_tags.append('Other')
+percentage_other = (other_views / total_views_latest_year) * 100
+top_views.append(percentage_other)
+print(f"Other: {other_views} views ({percentage_other:.2f}% of total)")
+
+# Plot distribution of views among the top tags in the latest year
+plt.figure(figsize=(10, 6))
+plt.bar(top_tags, top_views)
+plt.xlabel('Tags')
+plt.ylabel('Number of Views (%)')
+plt.title(f'Distribution of Views Among Top Tags in {latest_year}')
+plt.xticks(rotation=45)
+plt.tight_layout()
+plt.show()
diff --git a/pornhub_tags_regression_over_years.py b/pornhub_tags_regression_over_years.py
new file mode 100644
index 0000000..89e5472
--- /dev/null
+++ b/pornhub_tags_regression_over_years.py
@@ -0,0 +1,87 @@
+"""Find tags whose share of yearly views changes significantly over the years.
+
+For every tag in 'porn-with-dates-2022.csv', regresses the tag's share of each
+year's total views on the year and writes the tags with p < 0.05, together
+with slope, r and p, to the tab-delimited file 'significant_tags.txt'.
+"""
+import ast
+
+import numpy as np
+import pandas as pd
+from scipy.stats import linregress
+from sklearn.linear_model import LinearRegression
+
+# Read the CSV file
+df = pd.read_csv('porn-with-dates-2022.csv')
+
+# Convert the 'date' column to datetime
+df['date'] = pd.to_datetime(df['date'])
+
+# Parse each stringified tag list exactly once. ast.literal_eval accepts only
+# Python literals, so a malformed or malicious CSV cell cannot execute code
+# the way eval() would.
+tag_lists = df['categories'].apply(ast.literal_eval)
+video_years = df['date'].dt.year
+
+# Determine the unique tags in the dataset
+unique_tags = set()
+for tags in tag_lists:
+    unique_tags.update(tags)
+
+# Determine the unique years in the dataset
+unique_years = sorted(video_years.unique())
+
+# Total views per year, computed once instead of once per tag
+total_views_by_year = df.groupby(video_years)['views'].sum()
+
+# Views per (tag, year) gathered in a single pass over the data; set(tags)
+# counts a video once per tag even if the tag is listed twice.
+views_by_tag_year = {}
+for tags, year, views in zip(tag_lists, video_years, df['views']):
+    for tag in set(tags):
+        key = (tag, year)
+        views_by_tag_year[key] = views_by_tag_year.get(key, 0) + views
+
+# Initialize a dictionary to store the regression statistics for each tag
+tag_stats = {}
+
+# Iterate over each tag
+for tag in unique_tags:
+    # Initialize lists to store years and normalized views
+    years = []
+    normalized_views = []
+
+    # Iterate over each year
+    for year in unique_years:
+        # Print out progress for the researchers own sanity
+        print(f'Running on tag "{tag}" in year {year}')
+
+        # Look up the precomputed totals for the current year
+        total_year_views = total_views_by_year.get(year, 0)
+        tag_year_views = views_by_tag_year.get((tag, year), 0)
+
+        # Append the year
+        years.append(year)
+
+        # Append the normalized views for the current year
+        normalized_views.append(tag_year_views / total_year_views if total_year_views != 0 else 0)
+
+    # Perform linear regression on years and normalized views
+    slope, _, r_value, p_value, _ = linregress(years, normalized_views)
+
+    # Store the statistics of linear regression for the current tag
+    tag_stats[tag] = {'slope': slope, 'r_value': r_value, 'p_value': p_value}
+
+
+# Save the list of significant tags along with their statistics to a tab-delimited file
+significant_tags = []
+with open('significant_tags.txt', 'w') as file:
+    file.write('Tag\tSlope\tR Value\tP Value\n')
+    for tag, stats in tag_stats.items():
+        if stats['p_value'] < 0.05:
+            file.write(f"{tag}\t{stats['slope']}\t{stats['r_value']}\t{stats['p_value']}\n")
+            significant_tags.append(tag)
+
+print("Tags with significant changes in view count over years:")
+for tag in significant_tags:
+    print(tag)
diff --git a/total_views_over_years_pornhub.py b/total_views_over_years_pornhub.py
new file mode 100644
index 0000000..8558a0a
--- /dev/null
+++ b/total_views_over_years_pornhub.py
@@ -0,0 +1,84 @@
+"""Show how the latest year's views are distributed among its top 10 tags.
+
+Reads 'porn-with-dates-2022.csv', sums the views of every video carrying each
+tag in the most recent year of the 'date' column, prints the 10 most-viewed
+tags and plots their share of that year's views next to an 'Other' bar.
+"""
+import ast
+from collections import Counter
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Read the CSV file
+df = pd.read_csv('porn-with-dates-2022.csv')
+
+# Convert the 'date' column to datetime
+df['date'] = pd.to_datetime(df['date'])
+
+# Determine the latest year in the dataset
+latest_year = int(df['date'].dt.year.max())
+
+# Filter data for the latest year (copied so a helper column can be added)
+df_latest_year = df[df['date'].dt.year == latest_year].copy()
+
+# Parse each stringified tag list exactly once. ast.literal_eval accepts only
+# Python literals, so a malformed or malicious CSV cell cannot execute code
+# the way eval() would.
+df_latest_year['tag_list'] = df_latest_year['categories'].apply(ast.literal_eval)
+
+# Get the total number of views for the latest year
+total_views_latest_year = df_latest_year['views'].sum()
+
+# Sum the views of every video carrying each tag
+tag_views_latest_year = {}
+for tags, views in zip(df_latest_year['tag_list'], df_latest_year['views']):
+    for tag in tags:
+        tag_views_latest_year[tag] = tag_views_latest_year.get(tag, 0) + views
+
+# Raw view totals per tag, sorted in descending order
+popularity_latest_year_raw = pd.Series(tag_views_latest_year).sort_values(ascending=False)
+
+# Share (in percent) of the latest year's total views for each tag
+popularity_latest_year = (popularity_latest_year_raw / total_views_latest_year) * 100
+
+# Get the top 10 tags for the latest year
+top_tags_latest_year = popularity_latest_year.nlargest(10)
+top_tags_latest_year_raw = popularity_latest_year_raw.nlargest(10)
+
+# Print distribution of views among the top tags in the latest year
+print(f"Top 10 tags in {latest_year}:")
+top_tags = []
+top_views = []
+
+# Iterate over (tag, views) pairs in the top tags Series
+for tag, views in top_tags_latest_year_raw.items():
+    percentage = (views / total_views_latest_year) * 100
+    print(f"{tag}: {views} views ({percentage:.2f}% of total)")
+    top_tags.append(tag)
+    top_views.append(percentage)
+
+# Collect the URLs of videos carrying at least one top-10 tag. Membership is
+# tested on the parsed list because str.contains() would treat the tag as a
+# regular expression and would also match it inside longer tag names.
+top_tag_set = set(top_tags_latest_year_raw.index)
+has_top_tag = df_latest_year['tag_list'].apply(lambda tags: not top_tag_set.isdisjoint(tags))
+videos_with_top_tags = set(df_latest_year.loc[has_top_tag, 'url'])
+
+# Calculate the total views for 'other' tags
+other_views = df_latest_year[~df_latest_year['url'].isin(videos_with_top_tags)]['views'].sum()
+
+top_tags.append('Other')
+percentage_other = (other_views / total_views_latest_year) * 100
+top_views.append(percentage_other)
+print(f"Other: {other_views} views ({percentage_other:.2f}% of total)")
+
+# Plot distribution of views among the top tags in the latest year
+plt.figure(figsize=(10, 6))
+plt.bar(top_tags, top_views)
+plt.xlabel('Tags')
+plt.ylabel('Number of Views (%)')
+plt.title(f'Distribution of Views Among Top Tags in {latest_year}')
+plt.xticks(rotation=45)
+plt.tight_layout()
+plt.show()
diff --git a/total_views_over_years_xhamster.py b/total_views_over_years_xhamster.py
new file mode 100644
index 0000000..4dcfa68
--- /dev/null
+++ b/total_views_over_years_xhamster.py
@@ -0,0 +1,84 @@
+"""Show how the latest year's views are distributed among its top 10 tags.
+
+Reads 'xhamster.csv', sums the views of every video carrying each tag (the
+'channels' column) in the most recent year of 'upload_date', prints the 10
+most-viewed tags and plots their share of that year's views next to 'Other'.
+"""
+import ast
+from collections import Counter
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Read the CSV file
+df = pd.read_csv('xhamster.csv')
+
+# Convert the 'upload_date' column to datetime
+df['upload_date'] = pd.to_datetime(df['upload_date'])
+
+# Determine the latest year in the dataset
+latest_year = int(df['upload_date'].dt.year.max())
+
+# Filter data for the latest year (copied so a helper column can be added)
+df_latest_year = df[df['upload_date'].dt.year == latest_year].copy()
+
+# Parse each stringified tag list exactly once. ast.literal_eval accepts only
+# Python literals, so a malformed or malicious CSV cell cannot execute code
+# the way eval() would.
+df_latest_year['tag_list'] = df_latest_year['channels'].apply(ast.literal_eval)
+
+# Get the total number of views for the latest year
+total_views_latest_year = df_latest_year['nb_views'].sum()
+
+# Sum the views of every video carrying each tag
+tag_views_latest_year = {}
+for tags, views in zip(df_latest_year['tag_list'], df_latest_year['nb_views']):
+    for tag in tags:
+        tag_views_latest_year[tag] = tag_views_latest_year.get(tag, 0) + views
+
+# Raw view totals per tag, sorted in descending order
+popularity_latest_year_raw = pd.Series(tag_views_latest_year).sort_values(ascending=False)
+
+# Share (in percent) of the latest year's total views for each tag
+popularity_latest_year = (popularity_latest_year_raw / total_views_latest_year) * 100
+
+# Get the top 10 tags for the latest year
+top_tags_latest_year = popularity_latest_year.nlargest(10)
+top_tags_latest_year_raw = popularity_latest_year_raw.nlargest(10)
+
+# Print distribution of views among the top tags in the latest year
+print(f"Top 10 tags in {latest_year}:")
+top_tags = []
+top_views = []
+
+# Iterate over (tag, views) pairs in the top tags Series
+for tag, views in top_tags_latest_year_raw.items():
+    percentage = (views / total_views_latest_year) * 100
+    print(f"{tag}: {views} views ({percentage:.2f}% of total)")
+    top_tags.append(tag)
+    top_views.append(percentage)
+
+# Collect the IDs of videos carrying at least one top-10 tag. Membership is
+# tested on the parsed list because str.contains() would treat the tag as a
+# regular expression and would also match it inside longer tag names.
+top_tag_set = set(top_tags_latest_year_raw.index)
+has_top_tag = df_latest_year['tag_list'].apply(lambda tags: not top_tag_set.isdisjoint(tags))
+videos_with_top_tags = set(df_latest_year.loc[has_top_tag, 'id'])
+
+# Calculate the total views for 'other' tags
+other_views = df_latest_year[~df_latest_year['id'].isin(videos_with_top_tags)]['nb_views'].sum()
+
+top_tags.append('Other')
+percentage_other = (other_views / total_views_latest_year) * 100
+top_views.append(percentage_other)
+print(f"Other: {other_views} views ({percentage_other:.2f}% of total)")
+
+# Plot distribution of views among the top tags in the latest year
+plt.figure(figsize=(10, 6))
+plt.bar(top_tags, top_views)
+plt.xlabel('Tags')
+plt.ylabel('Number of Views (%)')
+plt.title(f'Distribution of Views Among Top Tags in {latest_year}')
+plt.xticks(rotation=45)
+plt.tight_layout()
+plt.show()
diff --git a/xhamster_views_over_years_lineplot.py b/xhamster_views_over_years_lineplot.py
new file mode 100644
index 0000000..1662330
--- /dev/null
+++ b/xhamster_views_over_years_lineplot.py
@@ -0,0 +1,56 @@
+"""Plot total xHamster views per calendar month, one line per upload year.
+
+Reads 'xhamster.csv', sums 'nb_views' for every (year, month) of
+'upload_date' and draws the monthly totals of each year as a separate line.
+"""
+import calendar
+import os
+
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Read the CSV file
+df = pd.read_csv('xhamster.csv')
+
+# Keep only rows whose 'upload_date' parses as a date. The explicit copy makes
+# the column assignment below act on an independent frame instead of a slice
+# of the original (avoids pandas' SettingWithCopyWarning).
+df = df.loc[pd.to_datetime(df['upload_date'], errors='coerce').notnull()].copy()
+
+# Assuming the date format is in YYYY-MM-DD
+df['upload_date'] = pd.to_datetime(df['upload_date'], format='%Y-%m-%d')
+
+# Group the data by year and month and calculate the total views for each month.
+# A single reset_index() already turns both group keys into ordinary columns.
+upload_year = df['upload_date'].dt.year.rename('year')
+upload_month = df['upload_date'].dt.month.rename('month')
+monthly_views = df.groupby([upload_year, upload_month])['nb_views'].sum().reset_index()
+
+# Rename the columns for clarity
+monthly_views.columns = ['Year', 'Month', 'Total Views']
+
+# Create a pivot table to have months as columns and years as rows
+pivot_monthly_views = monthly_views.pivot(index='Year', columns='Month', values='Total Views')
+
+# Plot the total views for each month, with each year represented as a different colored line
+plt.figure(figsize=(12, 8))
+for year in pivot_monthly_views.index:
+    plt.plot(pivot_monthly_views.columns, pivot_monthly_views.loc[year], label=str(year))
+
+plt.xlabel('Month')
+plt.ylabel('Total Views')
+plt.title('Total Views of Each Month Across Years')
+plt.legend()
+plt.grid(True)
+plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
+
+# # Specify the directory path
+# save_dir = os.path.join('..', 'reports')  # Goes one level above and into 'reports' directory
+
+# # Create the directory if it doesn't exist
+# os.makedirs(save_dir, exist_ok=True)
+
+# # Save the plot in the specified directory
+# plt.savefig(os.path.join(save_dir, 'xhamster_views_over_years_lineplot.png'))
+
+plt.show()