|
| 1 | +import re |
| 2 | +import scrapy |
| 3 | +import string |
| 4 | +from tpdb.BasePerformerScraper import BasePerformerScraper |
| 5 | + |
| 6 | + |
class SiteNaughtyMagPerformerPerformerSpider(BasePerformerScraper):
    """Performer scraper for the ScorePass network sites
    18eighteen.com and naughtymag.com.

    Crawls each site's model listing pages, follows every performer link,
    and extracts the performer's bio fields via the XPath ``selector_map``.
    All performers on these sites are female.
    """

    # XPath selectors (and post-processing regexes) consumed by the base
    # scraper; empty strings mean "field not available on this site".
    selector_map = {
        'name': '//section[contains(@id, "model-page")]//h1/text()',
        're_name': r'(.*)\'',
        'image': '//meta[@property="og:image"]/@content',
        'image_blob': True,
        'bio': '',
        'gender': '',
        'astrology': '',
        'birthday': '',
        'birthplace': '',
        'cupsize': '//span[contains(text(), "Bra Size")]/following-sibling::span/text()',
        'ethnicity': '//span[contains(text(), "Ethnicity")]/following-sibling::span/text()',
        'eyecolor': '',
        'fakeboobs': '',
        'haircolor': '//span[contains(text(), "Hair Color")]/following-sibling::span/text()',
        'height': '//span[contains(text(), "Height")]/following-sibling::span/text()',
        'measurements': '//span[contains(text(), "Measurements")]/following-sibling::span/text()',
        'nationality': '',
        'piercings': '',
        'tattoos': '',
        'weight': '//span[contains(text(), "Weight")]/following-sibling::span/text()',

        'pagination': '/big-boob-models/?page=%s',
        'external_id': r'model/(.*)/'
    }

    # Pre-accepted consent cookies so the sites serve content instead of a
    # cookie banner.
    cookies = [
        {"name": "cookie_consent", "value": "accepted"},
        {"name": "essentialCookies", "value": "true"},
        {"name": "functionalCookies", "value": "false"},
        {"name": "analyticsCookies", "value": "false"},
        {"name": "advertisingCookies", "value": "false"},
        {"name": "doNotSell", "value": "false"},
    ]

    # Conservative single-request crawl with aggressive retries: these sites
    # intermittently answer 403/5xx, so those codes are both retried and
    # allowed through to the spider.
    custom_scraper_settings = {
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'CONCURRENT_REQUESTS_PER_IP': 1,
        # ~ 'DOWNLOAD_FAIL_ON_DATALOSS': True,
        'COMPRESSION_ENABLED': False,
        'RETRY_ENABLED': True,
        'RETRY_TIMES': 10,
        'RETRY_HTTP_CODES': [500, 503, 504, 400, 408, 307, 403],
        'HANDLE_HTTPSTATUS_LIST': [500, 503, 504, 400, 408, 307, 403],
    }

    name = 'NaughtyMagPerformer'
    network = 'ScorePass'

    start_urls = [
        'https://www.18eighteen.com',
        'https://www.naughtymag.com',
    ]

    def get_next_page_url(self, base, page):
        """Return the absolute listing URL for ``page`` of ``base``.

        Each site has its own listing path; 18eighteen's listing is capped
        at 35 pages, past which an empty string stops pagination.
        """
        # Fall back to the selector_map pagination for any other base URL.
        # (Fixes a NameError in the original: ``pagination`` was assigned
        # only inside the two site-specific branches.)
        pagination = self.get_selector_map('pagination')
        if "18eighteen" in base:
            if int(page) > 35:
                return ""
            pagination = "/teen-babes/?page=%s&sort=newer"
        elif "naughtymag" in base:
            pagination = "/amateur-girls/?page=%s&sort=newer"
        return self.format_url(base, pagination % page)

    def get_gender(self, response):
        """Both sites feature only female performers."""
        return 'Female'

    def get_performers(self, response):
        """Yield a request to each performer page found on a listing page."""
        meta = response.meta
        performers = response.xpath('//div[contains(@class, "info")]/div[contains(@class, "trunc")]/a')
        for performer in performers:
            name = performer.xpath('./text()')
            if name:
                name = name.get()
                name = string.capwords(name.strip())
                # Single-word names get the numeric model id appended to
                # disambiguate performers who share a first name.
                if " " not in name:
                    perf_href = performer.xpath('./@href').get()
                    perf_id = re.search(r'/(\d+)/', perf_href).group(1)
                    name = name + " " + perf_id
                meta['name'] = name

            performer = performer.xpath('./@href').get()
            # Strip affiliate-tracking query strings before following.
            if "?nats" in performer:
                performer = re.search(r'(.*?)\?nats', performer).group(1)
            yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers, meta=meta)

    def get_measurements(self, response):
        """Return measurements as e.g. ``34DD-27-36``, or '' when absent.

        When a separate Bra Size field exists, it replaces the bust portion
        of the raw measurements string.
        """
        if 'measurements' in self.selector_map:
            measurements = self.process_xpath(response, self.get_selector_map('measurements')).get()
            if measurements and re.search(r'(\d+\w+?-\d+-\d+)', measurements):
                measurements = re.search(r'(\d+\w+?-\d+-\d+)', measurements).group(1)
                cupsize = self.get_cupsize(response)
                if cupsize:
                    # Keep only the waist/hip tail and prepend the cup size.
                    measurements = re.search(r'\d+\w+?(-\d+-\d+)', measurements).group(1)
                    measurements = cupsize.upper() + measurements
                return measurements.strip()
        return ''

    def get_height(self, response):
        """Convert a feet/inches height (e.g. ``5'4"``) to centimetres.

        Returns the raw string unchanged if it carries no foot/inch marks,
        or None when the base scraper found nothing.
        """
        height = super().get_height(response)
        if height:
            tot_inches = 0
            if re.search(r'(\d+)[\'\"]', height):
                feet = re.search(r'(\d+)\'', height)
                if feet:
                    feet = feet.group(1)
                    tot_inches = tot_inches + (int(feet) * 12)
                inches = re.search(r'\d+?\'(\d+)', height)
                if inches:
                    inches = inches.group(1)
                    inches = int(inches)
                    tot_inches = tot_inches + inches
                height = str(int(tot_inches * 2.54)) + "cm"
            return height
        return None

    def get_weight(self, response):
        """Convert a weight given in pounds to kilograms (``NNkg``)."""
        # Fixes a copy-paste bug: the original called super().get_height()
        # here, so the "weight" was derived from the height field.
        weight = super().get_weight(response)
        if weight:
            weight = re.search(r'(\d+)', weight)
            if weight:
                weight = weight.group(1)
                weight = str(int(int(weight) * .453592)) + "kg"
                return weight
        return None

    def get_ethnicity(self, response):
        """Normalize 'white' to the site-standard label 'Caucasian'."""
        ethnicity = super().get_ethnicity(response)
        # Guard: the base method may return None/'' when the field is
        # missing, which would crash .lower() (original bug).
        if ethnicity and "white" in ethnicity.lower():
            ethnicity = "Caucasian"
        return ethnicity

    def get_name(self, response):
        """Return the performer name, disambiguating single-word names
        with the numeric model id from the URL (mirrors get_performers)."""
        name = super().get_name(response)
        name = name.strip()
        if " " not in name:
            # Guard the regex: some URLs may lack a numeric id segment
            # (the original called .group(1) unconditionally).
            match = re.search(r'/(\d+)/', response.url)
            if match:
                name = name + " " + match.group(1)
        return name
0 commit comments