From 024408674ecdaa4872fc384a05fe222242b8da8c Mon Sep 17 00:00:00 2001
From: Brian Pow <brianpow@gmail.com>
Date: Sat, 10 Oct 2020 20:31:20 +0800
Subject: [PATCH] fix scraper for Belkin

---
 firmware/items.py          |   1 +
 firmware/spiders/belkin.py | 147 +++++++++++++++++--------------------
 2 files changed, 69 insertions(+), 79 deletions(-)

diff --git a/firmware/items.py b/firmware/items.py
index c123ab5..7c4f3b3 100644
--- a/firmware/items.py
+++ b/firmware/items.py
@@ -3,6 +3,7 @@
 class FirmwareImage(Item):
     category = Field(default=None)
     vendor = Field()
+    model = Field()  # set by the Belkin spider from the page <h1> text with " Downloads" removed
     product = Field(default=None)
 
     description = Field(default=None)
diff --git a/firmware/spiders/belkin.py b/firmware/spiders/belkin.py
index a7821f1..aa03a2d 100644
--- a/firmware/spiders/belkin.py
+++ b/firmware/spiders/belkin.py
@@ -4,96 +4,85 @@
 from firmware.items import FirmwareImage
 from firmware.loader import FirmwareLoader
 
-import urllib.request, urllib.parse, urllib.error
-
+import urllib.request
+import urllib.parse
+import urllib.error
+import re
 
 class BelkinSpider(Spider):
     name = "belkin"
     allowed_domains = ["belkin.com", "belkin.force.com"]
-    start_urls = ["http://www.belkin.com/us/support"]
+    start_urls = ["https://www.belkin.com/us/support-search?text=router"]
 
     def parse(self, response):
-        if not response.xpath(
-                "//form[@id='productSearchForm']//input[@name='category']/@value").extract()[0]:
-            for category in response.xpath("//form[@id='productSearchForm']/div[1]//ul[@class='select-options']//a/@data-id").extract():
-                yield FormRequest.from_response(response,
-                                                formname="productSearchForm",
-                                                formdata={
-                                                    "category": category},
-                                                callback=self.parse)
-        elif not response.xpath("//form[@id='productSearchForm']//input[@name='subCategory']/@value").extract()[0]:
-            for subcategory in response.xpath("//form[@id='productSearchForm']/div[2]//ul[@class='select-options']//a/@data-id").extract():
-                yield FormRequest.from_response(response,
-                                                formname="productSearchForm",
-                                                formdata={
-                                                    "subCategory": subcategory},
-                                                callback=self.parse)
-        else:
-            for product in response.xpath("//form[@id='productSearchForm']/div[3]//ul[@class='select-options']//a/@data-id").extract():
-                yield Request(
-                    url=urllib.parse.urljoin(
-                        response.url, "/us/support-product?pid=%s" % (product)),
-                    headers={"Referer": response.url},
-                    callback=self.parse_product)
+        yield from response.follow_all(css="a.prodPageLink", callback=self.parse_product)  # one request per a.prodPageLink anchor on the search page
 
+    # Example product page: https://www.belkin.com/us/support-product?pid=01t80000003L8FDAA0
     def parse_product(self, response):
-        for item in response.xpath("//div[@id='main-content']//a"):
-            if "firmware" in item.xpath(".//text()").extract()[0].lower():
-                yield Request(
-                    url=urllib.parse.urljoin(
-                        response.url, item.xpath(".//@href").extract()[0]),
-                    headers={"Referer": response.url},
-                    meta={"product": response.xpath("//p[@class='product-part-number']/text()").extract()[0].split(' ')[-1]},
-                    callback=self.parse_download)
-
-    def parse_download(self, response):
-        iframe = response.xpath(
-            "//div[@id='main-content']/iframe/@src").extract()
-
-        if iframe:
-            yield Request(
-                url=iframe[0],
-                headers={"Referer": response.url},
-                meta={"product": response.meta["product"]},
-                callback=self.parse_redirect)
-
-    def parse_redirect(self, response):
-        for text in response.body.split('\''):
-            if "articles/" in text.lower() and "download/" in text.lower():
-                yield Request(
-                    url=urllib.parse.urljoin(response.url, text),
-                    headers={"Referer": response.url},
-                    meta={"product": response.meta["product"]},
-                    callback=self.parse_kb)
+        heading = response.css("div.support-product-details-block h1::text").get()  # forwarded as meta["product"]
+        firmware_links = "div.support-product-details-block a[title='Downloads / Firmware']"
+        yield from response.follow_all(css=firmware_links, meta={"product": heading}, callback=self.parse_product_firmware)
 
-    def parse_kb(self, response):
-        # initial html tokenization to find regions segmented by e.g. "======"
-        # or "------"
-        filtered = response.xpath(
-            "//div[@class='sfdc_richtext']").extract()[0].split("=-")
+    # Example articles: https://www.belkin.com/us/support-article?articleNum=105643
+    #                   https://www.belkin.com/us/support-article?articleNum=4929
+    def parse_product_firmware(self, response):
+        """Yield a FirmwareImage for each <ul> in the <h3> blocks of #support-article-downloads."""
+        # response.meta["product"] is the product heading passed on by parse_product.
+        # version/build/url/size/description carry state from one element to the next;
+        # res_type starts as "" so a <ul> met before any <h3> cannot hit an unbound name.
+        version = build = url = size = description = res_type = ""
 
-        for entry in [x and x.strip() for x in filtered]:
-            resp = HtmlResponse(url=response.url, body=entry,
-                                encoding=response.encoding)
+        divs = response.css("#support-article-downloads > div")
+        model = response.css("h1::text").get(default="").replace(" Downloads", "")  # "" when the page has no <h1>
+        for div in divs:
+            if div.css("h2"):  # <h2> block: remember the version for the entries that follow
+                version = div.xpath(".//h2/*/text()").get(default="").replace("Version ", "")
+            elif div.css("h3"):  # <h3> block: walk every descendant element in document order
+                for el in div.xpath(".//*"):
+                    tag = el.xpath("name()").get()
+                    if tag == "h3":
+                        res_type = el.xpath(".//text()").get(default="")  # later prefixed to the description
+                        self.logger.debug("%s: %s=%s", response.meta['product'], tag, res_type)
 
-            for link in resp.xpath("//a"):
-                href = link.xpath("@href").extract()[0]
-                if "cache-www" in href:
-                    text = resp.xpath("//text()").extract()
-                    text_next = link.xpath("following::text()").extract()
+                    elif tag == "span" or tag == "div":
+                        tmp=el.xpath(".//a/@href").get()
+                        if tmp:
+                            url = tmp
+                        for text in el.xpath(".//text()").getall():
+                            text=text.strip()
+                            matches = re.match(r"Ver\. ([\d\.]+)",text)
+                            if matches:
+                                build=matches[1]
+                            matches = re.match(r"(\d+) [KMG]B",text)
 
-                    item = FirmwareLoader(item=FirmwareImage(),
-                                          response=response,
-                                          date_fmt=["%b %d, %Y", "%B %d, %Y",
-                                                    "%m/%d/%Y"])
+                            if matches:
+                                size=matches[1]
+                        self.logger.debug("%s: %s=%s" % (response.meta['product'], tag, url))
 
-                    version = FirmwareLoader.find_version_period(text_next)
-                    if not version:
-                        version = FirmwareLoader.find_version_period(text)
+                    elif tag == "ul":
+                        description = res_type + "\n" + "\n".join(el.xpath(".//li//text()").getall())
+                        self.logger.debug("%s: %s=%s" % (response.meta['product'], tag, description))
+                        item = FirmwareLoader(item=FirmwareImage(),
+                                            response=response,
+                                            date_fmt=["%b %d, %Y", "%B %d, %Y",
+                                                        "%m/%d/%Y"])
+                        item.add_value("version", version)
+                        item.add_value("model", model)
+                        item.add_value("build", build)
+                        item.add_value("url", url)
+                        item.add_value("size", size)
+                        item.add_value("description", description)
+                        item.add_value("product", response.meta["product"])
+                        item.add_value("vendor", self.name)
+                        build = ""
+                        url = ""
+                        size = ""
+                        description = ""
 
-                    item.add_value("version", version)
-                    item.add_value("date", item.find_date(text))
-                    item.add_value("url", href)
-                    item.add_value("product", response.meta["product"])
-                    item.add_value("vendor", self.name)
-                    yield item.load_item()
+                        yield item.load_item()
+                    elif tag == "a":
+                        url = el.xpath("@href").get()  # link seen outside a span/div wrapper
+                    elif tag == "br":
+                        self.logger.debug("%s: %s=%s", response.meta['product'], tag, "")
+                    else:  # unexpected element: report it via warning() (warn() is a deprecated alias)
+                        self.logger.warning("%s: %s=%s", response.meta['product'], tag, el.xpath(".//text()").get())
\ No newline at end of file