-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhours_per_article_exec.py
More file actions
90 lines (75 loc) · 3.72 KB
/
hours_per_article_exec.py
File metadata and controls
90 lines (75 loc) · 3.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Calculates average time spent per article for a given obsession, with a min
page view threshold (num) to include
Formula - for a given obsession sum of total time spent for all articles,
divided by number of articles that comprise of that obsession
"""
import numpy as np
import pandas as pd
def main(df_pv_in, df_time_in, num=500):
"""primary function: pv_data and time_data should be from same time
interval;
num refers to threshold of pageviews; so default num=500 means that only
articles with a minimum of 500 page views will be considered
returns- sorted DataFrame
"""
dfpv = df_pv_in.groupby('article.id').sum().reset_index()
dfpv = dfpv[dfpv['result'] >= num]
dftime = df_time_in.groupby('article.id').sum().reset_index()
df = dfpv.merge(dftime, on='article.id')
df.columns = ['article.id', 'page views', 'time']
df = df[['article.id', 'page views', 'time']]
df_unique_obsess = unique_obsessions(df_pv_in, df_time_in, num=num)
df = df.merge(df_unique_obsess, on='article.id')
df = df.sort_values('page views', ascending=False)
storage = {}
for obsession in set(df['article.obsessions']):
dft = df[df['article.obsessions'] == obsession]
storage.setdefault('total time (h)', []).append(
int(dft['time'].sum() / 3600))
storage.setdefault('unique articles > 500 pv', []).append(
dft['article.id'].nunique())
storage.setdefault('obsession', []).append(obsession)
df_obsess_totals = pd.DataFrame(storage).sort_values(
'total time (h)', ascending=False)
title_t1 = 'avg cumulative time per article (h)'
df_obsess_totals[title_t1] = (df_obsess_totals['total time (h)'] /
df_obsess_totals['unique articles > 500 pv'])
df_obsess_totals[title_t1] = df_obsess_totals[title_t1].apply(
lambda x: int(x))
df_obsess_totals = df_obsess_totals.sort_values(title_t1, ascending=False)
df_obsess_totals.index = np.arange(len(df_obsess_totals))
return df_obsess_totals
def unique_obsessions(df_pv_in, df_time_in, num = 500):
"""
"""
pv = df_pv_in.groupby('article.id').sum().reset_index()
pv = pv[pv['result'] >= num]
print(pv['article.id'].nunique())
tt = df_time_in.groupby('article.id').sum().reset_index()
df = pv.merge(tt, on='article.id')
#remove articles that don't make the cut; IN says that the articles are in the article_list
article_list = set(df['article.id'])
df_pv = df_pv_in.copy()
df_pv['in'] = df_pv['article.id'].apply(lambda x: x in article_list)
#do a first cut of removing duplicates
df_pv = df_pv[df_pv['in'] == True]
df_pv = df_pv[['article.id', 'article.obsessions']].drop_duplicates()
#find the remaining duplicates
df_pv2 = df_pv.groupby('article.id').count().reset_index()
remaining_dupes = set(df_pv2[df_pv2['article.obsessions'] > 1]['article.id'])
df_pv['dupe'] = df_pv['article.id'].apply(lambda x: x in remaining_dupes)
df_unique = df_pv[df_pv['dupe'] != True]
df_dupe = df_pv[df_pv['dupe'] == True].copy()
obsess_storage = {}
tracker = len(set(df_dupe['article.id']))
for i, article in enumerate(set(df_dupe['article.id'])):
x1 = df_pv[df_pv['article.id'] == article]
obsess_storage.setdefault('article.obsessions', []).append(x1['article.obsessions'].values[0])
obsess_storage.setdefault('article.id', []).append(article)
if int(i / tracker * 100) % 10 == 0:
print(int(i / tracker * 100), '% ', end='')
df_FINAL = df_unique[['article.id', 'article.obsessions']].append(pd.DataFrame(obsess_storage))
return df_FINAL