-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathiab_google_mapping.py
More file actions
34 lines (26 loc) · 1.17 KB
/
iab_google_mapping.py
File metadata and controls
34 lines (26 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import urllib.request
import json
import tldextract
# Progress marker: emitted once, as soon as the script begins executing.
print("start")

# Where the IAB vendor list (a JSON document) is downloaded from.
iabVendorListUrl = "https://vendorlist.consensu.org/vendorlist.json"

# Where Google's provider dictionary (a CSV file) is downloaded from.
googleProvidersUrl = (
    "https://storage.googleapis.com/adx-rtb-dictionaries/providers.csv"
)

# Path of the CSV file this script produces.
outputFilePath = "iab_google_mapping.csv"
def clear_url(origin_url):
    """Reduce a URL to its bare host name so URLs can be compared by site.

    The URL is split with ``tldextract.extract`` and its non-empty
    subdomain, domain and suffix parts are re-joined with dots. A leading
    ``www`` subdomain label is dropped, so ``https://www.bbc.co.uk/privacy``
    and ``http://bbc.co.uk`` both yield ``'bbc.co.uk'``.

    Args:
        origin_url: URL or host name, as a string.

    Returns:
        The dotted host name, or an empty string when no part could be
        extracted from the input.
    """
    extracted = tldextract.extract(origin_url)

    # Read the parts by attribute instead of iterating the result: newer
    # tldextract releases add a boolean ``is_private`` field and later stop
    # being iterable altogether, either of which breaks a plain str.join.
    subdomain = extracted.subdomain

    # Strip only a leading "www" label. A blanket replace('www.', '') would
    # also eat the tail of unrelated labels (e.g. "awww.example.com").
    if subdomain == 'www':
        subdomain = ''
    elif subdomain.startswith('www.'):
        subdomain = subdomain[len('www.'):]

    parts = (subdomain, extracted.domain, extracted.suffix)
    return '.'.join(part for part in parts if part)
def _with_site_key(frame, column):
    """Return the rows of *frame* with a usable URL in *column*, normalised by clear_url.

    Rows whose value is missing are dropped before normalisation, because a
    missing value (NaN) is not a string and cannot be parsed as a URL. Rows
    whose URL normalises to an empty string are dropped afterwards, so that
    empty keys on both sides are never joined to each other.
    """
    usable = frame.dropna(subset=[column]).copy()
    usable[column] = usable[column].apply(clear_url)
    return usable[usable[column] != '']


# download and prepare IAB vendors
with urllib.request.urlopen(iabVendorListUrl) as response:
    vendorlist = json.loads(response.read().decode())
# One row per entry of the 'vendors' collection in the downloaded document.
df_iab = json_normalize(vendorlist['vendors'])
df_iab = _with_site_key(df_iab, 'policyUrl')

# download and prepare Google providers
df_google = pd.read_csv(googleProvidersUrl)
df_google = _with_site_key(df_google, 'policy_url')

# intersect IAB and Google: keep only pairs whose normalised policy host matches
intersected_df = pd.merge(
    df_iab,
    df_google,
    how='inner',
    left_on='policyUrl',
    right_on='policy_url',
)
# Keep the identifying columns from each side plus the shared host key.
clean_df = intersected_df[['id', 'name', 'policyUrl', 'provider_id', 'provider_name']]

# write result to csv
clean_df.to_csv(outputFilePath, index=False)
print(f'{outputFilePath} is ready')