Diffstat (limited to 'web_grater/web_grater/spiders/grater_spider.py')
-rw-r--r--  web_grater/web_grater/spiders/grater_spider.py  66
1 file changed, 66 insertions(+), 0 deletions(-)
diff --git a/web_grater/web_grater/spiders/grater_spider.py b/web_grater/web_grater/spiders/grater_spider.py
new file mode 100644
index 0000000..a8780d6
--- /dev/null
+++ b/web_grater/web_grater/spiders/grater_spider.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+"""Grater Spider for parsing URLs."""
+from typing import Generator
+
+from blessings import Terminal
+from scrapy.http import Request
+from scrapy.http.response.html import HtmlResponse
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import Spider
+
+t = Terminal()
+
+# The site you want to crawl
+WEBSITE = "www.example.com"
+
+
+class CustomLinkExtractor(LinkExtractor):
+    """Custom extractor for checking results."""
+
+    def extract_links(self, response):
+        """Get links from a page.
+
+        If you run the scraper and find that a URL is broken and want to
+        know which page it was on, uncomment the block below and update
+        the search term; pages containing that link will be printed in
+        yellow so you can find them.
+        """
+        links = super().extract_links(response)
+        # for link in links:
+        #     if 'insert-search-term-here' in link.url:
+        #         print(f'{t.yellow}200 Page: {response.url}{t.normal}')
+        return links
+
+
+class GraterSpider(Spider):
+    """Grater Spider."""
+
+    name = "grater"
+    allowed_domains = [WEBSITE]
+    start_urls = [f"https://{WEBSITE}"]
+
+    def __init__(self, *args, **kwargs):
+        """Init the custom link extractor."""
+        self.link_extractor = CustomLinkExtractor()
+        return super().__init__(*args, **kwargs)
+
+    def parse(self, response: HtmlResponse) -> Generator:
+        """Parse a page, following every link and image it contains."""
+        if response.status == 200:
+            print(f"{t.green}200 Page: {response.url}{t.normal}")
+        else:
+            print(f"{t.red}{response.status} Page: {response.url}{t.normal}")
+        # Follow all links found on the current page
+        for link in self.link_extractor.extract_links(response):
+            yield Request(link.url, callback=self.parse)
+        # Request all the images on the current page
+        img_urls = response.css("img::attr(src)").extract()
+        for img_url in img_urls:
+            yield response.follow(img_url, callback=self.parse_image_request)
+
+    def parse_image_request(self, response: HtmlResponse) -> None:
+        """Report the status of an image request."""
+        if response.status == 200:
+            print(f"{t.green}200 Image: {response.url}{t.normal}")
+        else:
+            print(f"{t.red}{response.status} Image: {response.url}{t.normal}")
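
A note on running this spider: by default Scrapy's HttpError middleware drops non-2xx responses before they reach spider callbacks, so the red error branches above would normally never fire. Below is a minimal sketch of running the spider in-process with error statuses allowed through. The import path `web_grater.spiders.grater_spider` is an assumption based on the file path in this diff; `scrapy crawl grater` from the project root is the usual command-line equivalent.

    # sketch: run the grater spider in-process (assumes the standard
    # Scrapy project layout implied by this diff's file path)
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from web_grater.spiders.grater_spider import GraterSpider

    settings = get_project_settings()
    # Scrapy's HttpError middleware normally filters out non-2xx
    # responses before they reach callbacks; allowing all statuses
    # through lets the spider report broken pages and images in red.
    settings.set("HTTPERROR_ALLOW_ALL", True)

    process = CrawlerProcess(settings)
    process.crawl(GraterSpider)
    process.start()  # blocks until the crawl finishes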