Skip to content

Commit 99d6568

Browse files
committed
Updating scrapers
1 parent 858949d commit 99d6568

100 files changed

Lines changed: 3782 additions & 1478 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,3 +163,10 @@ scenes/networkData18.py
163163
scenes/networkDirtyFlixRip.py
164164
scenes/siteYesGirlzRip.py
165165
scenes/siteScorelandPlaywright.py
166+
scenes/aggregatorPornbox.py
167+
scenes/generic.py
168+
scenes/genericJSON.py
169+
scenes/generic.py
170+
scenes/aggregatorPornbox.py
171+
scenes/networkAdulttimeAPIFiller.py
172+
scenes/networkARXBucks_OLD_DO_NOT_USE.py

movies/MovieDorcelClub.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ class MovieDorcelClubSpider(BaseSceneScraper):
2929
'tags': '',
3030
'external_id': 'scene/(\\d+)',
3131
'trailer': '',
32-
'pagination': '/en/porn-movie?p=%s'
32+
# ~ 'pagination': '/en/porn-movie?p=%s'
33+
'pagination': '/en/porn-movie-soon?p=%s'
3334
}
3435
custom_scraper_settings = {
3536
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62',

performers/networkBadoinkVrPerformer.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def match_path(argument):
2121
class networkBadoinkVrPerformerSpider(BasePerformerScraper):
2222

2323
selector_map = {
24-
'name': '//div[@class="girl-details-info"]/h1/text()',
24+
'name': '//ul[contains(@class, "breadcrumbs")]/li[last()]/a/span/text()',
2525
'image': '//picture/img[@id="girlImage"]/@src',
2626
'nationality': '//ul[@id="girlOptionDetails"]/li/span[contains(text(),"Country")]/following-sibling::span/text()',
2727
'ethnicity': '//ul[@id="girlOptionDetails"]/li/span[contains(text(),"Ethnicity")]/following-sibling::span/text()',
@@ -33,14 +33,14 @@ class networkBadoinkVrPerformerSpider(BasePerformerScraper):
3333
'birthday': '//ul[@id="girlOptionDetails"]/li/span[contains(text(),"Age")]/following-sibling::span/text()',
3434
'bio': '//p[@class="girl-details-bio"]/text()',
3535
'pagination': '?page=%s&hybridview=member',
36-
'external_id': '.*\/(.*)\/$'
36+
'external_id': r'.*/(.*)/$'
3737
}
3838

3939
name = 'BadoinkVrPerformer'
4040
network = 'Badoink VR'
4141
parent = 'Badoink VR'
4242

43-
43+
4444
start_urls = [
4545
'https://badoinkvr.com',
4646
'https://babevr.com',
@@ -49,11 +49,11 @@ class networkBadoinkVrPerformerSpider(BasePerformerScraper):
4949
'https://vrcosplayx.com',
5050
'https://realvr.com',
5151
]
52-
52+
5353
def get_next_page_url(self, base, page):
    """Build the listing URL for ``page`` on the site rooted at ``base``.

    The pagination path template is looked up per host through the
    module-level ``match_path`` helper and then filled in with the page
    number before being joined onto ``base``.
    """
    host = urlparse(base).netloc
    path_template = match_path(host)
    return self.format_url(base, path_template % page)
5757

5858
def get_gender(self, response):
    """Return the fixed gender used for every performer of this spider."""
    gender = 'Female'
    return gender
@@ -64,14 +64,14 @@ def get_performers(self, response):
6464
yield scrapy.Request(
6565
url=self.format_link(response, performer),
6666
callback=self.parse_performer
67-
)
68-
69-
67+
)
68+
69+
7070
def get_height(self, response):
7171
if 'height' in self.selector_map:
7272
height = self.process_xpath(response, self.get_selector_map('height')).get()
7373
if height:
74-
str_height = re.findall('(\d{1,2})', height)
74+
str_height = re.findall(r'(\d{1,2})', height)
7575
if len(str_height):
7676
feet = int(str_height[0])
7777
if len(str_height) > 1:
@@ -80,30 +80,35 @@ def get_height(self, response):
8080
inches = 0
8181
heightcm = str(round(((feet*12)+inches) * 2.54)) + "cm"
8282
return heightcm.strip()
83-
return ''
83+
return ''
84+
8485

85-
8686
def get_cupsize(self, response):
    """Return the text before the first dash of the measurements field.

    Returns an empty string when no measurements selector is configured,
    when the selector yields nothing, or when the value has no dash.
    """
    if 'measurements' not in self.selector_map:
        return ''

    raw = self.process_xpath(response, self.get_selector_map('measurements')).get()
    if not raw or "-" not in raw:
        return ''

    # Non-greedy capture: everything up to (not including) the first dash.
    leading = re.search(r'(.*?)-.*', raw).group(1)
    return leading.strip() if leading else ''
9595

9696
def get_birthday(self, response):
    """Derive an approximate birthdate from the page's "Age" field.

    The site only publishes an age, so the birthdate is computed as the
    date of import minus that many years.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or an empty string when no
        birthday selector is configured, the field is missing, it contains
        no digits, or the age falls outside 18-99.
    """
    if 'birthday' not in self.selector_map:
        return ''

    age_text = self.process_xpath(response, self.get_selector_map('birthday')).get()
    if not age_text:
        return ''

    # The field may hold non-numeric text; without this guard a missing
    # match raised AttributeError on ``.group``.
    age_match = re.search(r'(\d+)', age_text)
    if not age_match:
        return ''

    age = int(age_match.group(1))
    if not 18 <= age <= 99:
        return ''

    birthdate = datetime.now() - relativedelta(years=age)
    return birthdate.strftime('%Y-%m-%d')
109-
109+
110+
def get_image(self, response):
    """Return the performer image URL without its ``?q=`` query suffix.

    The URL comes from the base class getter; everything from the first
    ``?q=`` onward is dropped. A falsy result from the base getter is
    returned unchanged instead of raising on the membership test.
    """
    image = super().get_image(response)
    if image and "?q=" in image:
        image = image.partition("?q=")[0]
    return image
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import re
2+
import scrapy
3+
4+
from tpdb.BasePerformerScraper import BasePerformerScraper
5+
6+
7+
class PerformerSpider(BasePerformerScraper):
    """Performer spider for hobby.porn.

    Most selectors below are still empty placeholders; only the pagination
    path and the external-id pattern are filled in.
    """

    selector_map = {
        'name': '',
        'image': '',
        'image_blob': True,
        'bio': '',
        'gender': '',
        'astrology': '',
        'birthday': '',
        'birthplace': '',
        'cupsize': '',
        'ethnicity': '',
        'eyecolor': '',
        'fakeboobs': '',
        'haircolor': '',
        'height': '',
        'measurements': '',
        'nationality': '',
        'piercings': '',
        'tattoos': '',
        'weight': '',

        'pagination': '/models/%s/',
        'external_id': r'model/(.*)/'
    }

    name = 'HobbyPornPerformer'
    network = 'HobbyPorn'

    start_urls = [
        'https://hobby.porn',
    ]

    def get_gender(self, response):
        """Return the fixed gender used for every performer of this site."""
        return 'Female'

    def get_performers(self, response):
        """Yield one request per performer link found on a listing page."""
        # NOTE(review): the XPath is still an empty placeholder and must be
        # replaced with the real performer-link selector.
        performers = response.xpath('').getall()
        for performer in performers:
            yield scrapy.Request(
                url=self.format_link(response, performer),
                callback=self.parse_performer,
                cookies=self.cookies,
                headers=self.headers,
            )

    def get_measurements(self, response):
        """Return the ``<bust+cup>-<waist>-<hips>`` portion of the measurements field.

        Returns an empty string when the selector is not configured, yields
        nothing, or the text does not contain that pattern.
        """
        if 'measurements' in self.selector_map:
            measurements = self.process_xpath(response, self.get_selector_map('measurements')).get()
            if measurements:
                # Search once and reuse the match instead of running the
                # same pattern twice.
                found = re.search(r'(\d+\w+-\d+-\d+)', measurements)
                if found:
                    return found.group(1).strip()
        return ''

    def get_cupsize(self, response):
        """Return the cup size, from its own selector or from the measurements.

        When a non-empty ``cupsize`` selector is configured its value is used
        (an empty string if the page has no such node, which previously
        raised AttributeError). Otherwise the leading ``<digits><letters>``
        part of the measurements field is returned.
        """
        if 'cupsize' in self.selector_map and self.get_selector_map('cupsize'):
            cupsize = self.process_xpath(response, self.get_selector_map('cupsize')).get()
            return cupsize.strip() if cupsize else ''

        if 'measurements' in self.selector_map:
            measurements = self.process_xpath(response, self.get_selector_map('measurements')).get()
            if measurements:
                found = re.search(r'(\d+\w+)-\d+-\d+', measurements)
                if found:
                    return found.group(1).strip()
        return ''
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import re
2+
import scrapy
3+
import string
4+
from tpdb.BasePerformerScraper import BasePerformerScraper
5+
6+
7+
class SiteNaughtyMagPerformerPerformerSpider(BasePerformerScraper):
    """Performer spider for the 18eighteen.com and naughtymag.com model pages."""

    selector_map = {
        'name': '//section[contains(@id, "model-page")]//h1/text()',
        're_name': r'(.*)\'',
        'image': '//meta[@property="og:image"]/@content',
        'image_blob': True,
        'bio': '',
        'gender': '',
        'astrology': '',
        'birthday': '',
        'birthplace': '',
        'cupsize': '//span[contains(text(), "Bra Size")]/following-sibling::span/text()',
        'ethnicity': '//span[contains(text(), "Ethnicity")]/following-sibling::span/text()',
        'eyecolor': '',
        'fakeboobs': '',
        'haircolor': '//span[contains(text(), "Hair Color")]/following-sibling::span/text()',
        'height': '//span[contains(text(), "Height")]/following-sibling::span/text()',
        'measurements': '//span[contains(text(), "Measurements")]/following-sibling::span/text()',
        'nationality': '',
        'piercings': '',
        'tattoos': '',
        'weight': '//span[contains(text(), "Weight")]/following-sibling::span/text()',

        'pagination': '/big-boob-models/?page=%s',
        'external_id': r'model/(.*)/'
    }

    cookies = [
        {"name": "cookie_consent", "value": "accepted"},
        {"name": "essentialCookies", "value": "true"},
        {"name": "functionalCookies", "value": "false"},
        {"name": "analyticsCookies", "value": "false"},
        {"name": "advertisingCookies", "value": "false"},
        {"name": "doNotSell", "value": "false"},
    ]

    custom_scraper_settings = {
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'CONCURRENT_REQUESTS_PER_IP': 1,
        # ~ 'DOWNLOAD_FAIL_ON_DATALOSS': True,
        'COMPRESSION_ENABLED': False,
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 10,
        'RETRY_HTTP_CODES': [500, 503, 504, 400, 408, 307, 403],
        'HANDLE_HTTPSTATUS_LIST': [500, 503, 504, 400, 408, 307, 403],
    }

    name = 'NaughtyMagPerformer'
    network = 'ScorePass'

    start_urls = [
        'https://www.18eighteen.com',
        'https://www.naughtymag.com',
    ]

    def get_next_page_url(self, base, page):
        """Return the model-listing URL for ``page`` on the site at ``base``.

        Each site has its own listing path. For 18eighteen an empty string
        is returned once ``page`` exceeds 35. Any other host falls back to
        the ``pagination`` template from ``selector_map`` (previously this
        raised UnboundLocalError).
        """
        if "18eighteen" in base:
            if int(page) > 35:
                return ""
            pagination = "/teen-babes/?page=%s&sort=newer"
        elif "naughtymag" in base:
            pagination = "/amateur-girls/?page=%s&sort=newer"
        else:
            pagination = self.get_selector_map('pagination')
        return self.format_url(base, pagination % page)

    def get_gender(self, response):
        """Return the fixed gender used for every performer of these sites."""
        return 'Female'

    def get_performers(self, response):
        """Yield one profile request per performer link on a listing page.

        The link text is capitalised and passed along as ``meta['name']``.
        A single-word name gets the numeric id from the link appended. Any
        ``?nats...`` tracking suffix is stripped from the link.
        """
        performers = response.xpath('//div[contains(@class, "info")]/div[contains(@class, "trunc")]/a')
        for performer in performers:
            href = performer.xpath('./@href').get()
            if not href:
                # Anchor without a target: nothing to request.
                continue

            # Fresh copy per request so a name set for one performer cannot
            # leak into the next one or into the listing response's meta.
            meta = dict(response.meta)

            raw_name = performer.xpath('./text()').get()
            if raw_name:
                name = string.capwords(raw_name.strip())
                if name and " " not in name:
                    id_match = re.search(r'/(\d+)/', href)
                    if id_match:
                        name = name + " " + id_match.group(1)
                if name:
                    meta['name'] = name

            if "?nats" in href:
                href = href.partition("?nats")[0]

            yield scrapy.Request(
                url=self.format_link(response, href),
                callback=self.parse_performer,
                cookies=self.cookies,
                headers=self.headers,
                meta=meta,
            )

    def get_measurements(self, response):
        """Return measurements as ``<bust+cup>-<waist>-<hips>``.

        When a cup size is available from ``get_cupsize`` it replaces the
        bust part of the scraped value (upper-cased). Returns an empty
        string when the field is missing or does not match the pattern.
        """
        if 'measurements' in self.selector_map:
            measurements = self.process_xpath(response, self.get_selector_map('measurements')).get()
            if measurements:
                found = re.search(r'(\d+\w+?-\d+-\d+)', measurements)
                if found:
                    measurements = found.group(1)
                    cupsize = self.get_cupsize(response)
                    if cupsize:
                        # Keep "-waist-hips" and prefix the bra-size value.
                        tail = re.search(r'\d+\w+?(-\d+-\d+)', measurements).group(1)
                        measurements = cupsize.upper() + tail
                    return measurements.strip()
        return ''

    def get_height(self, response):
        """Convert a feet/inches height such as ``5'4"`` to centimetres.

        Returns:
            ``"<n>cm"`` (truncated to an integer), or ``None`` when the base
            getter yields nothing or no feet/inches value could be parsed.
        """
        height = super().get_height(response)
        if height and re.search(r'(\d+)[\'\"]', height):
            tot_inches = 0
            feet = re.search(r'(\d+)\'', height)
            if feet:
                tot_inches += int(feet.group(1)) * 12
            inches = re.search(r'\d+?\'(\d+)', height)
            if inches:
                tot_inches += int(inches.group(1))
            # A bare inch mark with no feet parses to nothing; report that
            # as unknown rather than "0cm".
            if tot_inches:
                return str(int(tot_inches * 2.54)) + "cm"
        return None

    def get_weight(self, response):
        """Convert the weight field from pounds to kilograms.

        Returns:
            ``"<n>kg"`` (truncated to an integer), or ``None`` when the
            field is missing or contains no digits.
        """
        # Read the weight field; the previous code read the height field
        # here by mistake.
        weight = super().get_weight(response)
        if weight:
            pounds = re.search(r'(\d+)', weight)
            if pounds:
                return str(int(int(pounds.group(1)) * .453592)) + "kg"
        return None

    def get_ethnicity(self, response):
        """Return the ethnicity, mapping any value containing "white" to "Caucasian"."""
        ethnicity = super().get_ethnicity(response)
        if ethnicity and "white" in ethnicity.lower():
            ethnicity = "Caucasian"
        return ethnicity

    def get_name(self, response):
        """Return the performer name, made unique for single-word names.

        A name without a space gets the numeric id from the page URL
        appended (when the URL contains one). A falsy name from the base
        getter is returned unchanged.
        """
        name = super().get_name(response)
        if not name:
            return name
        name = name.strip()
        if name and " " not in name:
            id_match = re.search(r'/(\d+)/', response.url)
            if id_match:
                name = name + " " + id_match.group(1)
        return name

scenes/NetworkPornProsAPI.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,5 +131,8 @@ def get_scenes(self, response):
131131
if ("pornplus" in meta['site'] or "strippers4k" in meta['site']) and item['date'] < "2025-04-30":
132132
submit = False
133133

134+
if "facials4k" in item['site'] and "pornplus" in item['url']:
135+
submit = False
136+
134137
if submit:
135138
yield self.check_item(item, self.days)

0 commit comments

Comments
 (0)