Diffstat (limited to 'web_grater/web_grater/spiders/grater_spider.py')
-rw-r--r-- web_grater/web_grater/spiders/grater_spider.py | 66
1 file changed, 66 insertions(+), 0 deletions(-)
diff --git a/web_grater/web_grater/spiders/grater_spider.py b/web_grater/web_grater/spiders/grater_spider.py
new file mode 100644
index 0000000..a8780d6
--- /dev/null
+++ b/web_grater/web_grater/spiders/grater_spider.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+"""Grater Spider for parsing URLs."""
+from typing import Generator
+
+from blessings import Terminal
+from scrapy.http import Request, Response
+from scrapy.http.response.html import HtmlResponse
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import Spider
+
+t = Terminal()
+
+# The site you want to crawl
+WEBSITE = "www.example.com"
+
+
+class CustomLinkExtractor(LinkExtractor):
+    """Custom extractor for checking results."""
+
+    def extract_links(self, response):
+        """Get links from a page.
+
+        If the crawl turns up a broken URL and you want to know which
+        pages link to it, uncomment the lines below and update the
+        search term: every page containing that link is then printed
+        in yellow so you can find it quickly.
+        """
+        links = super().extract_links(response)
+        # for link in links:
+        #     if 'insert-search-term-here' in link.url:
+        #         print(f'{t.yellow}200 Page: {response.url}{t.normal}')
+        return links
+
+
+class GraterSpider(Spider):
+    """Grater Spider."""
+
+    name = "grater"
+    allowed_domains = [WEBSITE]
+    start_urls = [f"https://{WEBSITE}"]
+
+    def __init__(self, *args, **kwargs):
+        """Init the custom link extractor."""
+        super().__init__(*args, **kwargs)
+        self.link_extractor = CustomLinkExtractor()
+
+    def parse(self, response: HtmlResponse) -> Generator:
+        """Parse a page."""
+        if response.status == 200:
+            print(f"{t.green}200 Page: {response.url}{t.normal}")
+        else:
+            print(f"{t.red}{response.status} Page: {response.url}{t.normal}")
+        # Follow every link found on the current page
+        for link in self.link_extractor.extract_links(response):
+            yield Request(link.url, callback=self.parse)
+        # Request every image on the page to check that it resolves
+        img_urls = response.css("img::attr(src)").extract()
+        for img_url in img_urls:
+            yield response.follow(img_url, callback=self.parse_image_request)
+
+    def parse_image_request(self, response: Response) -> None:
+        """Report the status of an image request."""
+        if response.status == 200:
+            print(f"{t.green}200 Image: {response.url}{t.normal}")
+        else:
+            print(f"{t.red}{response.status} Image: {response.url}{t.normal}")
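For reference, the spider can be run by its name with Scrapy's command-line tool from the project root (scrapy crawl grater). The sketch below runs it programmatically instead, using Scrapy's CrawlerProcess; the import path is an assumption based on the file path above and a standard scrapy startproject layout.

# Minimal sketch: run GraterSpider outside the Scrapy CLI.
# Assumes the working directory is the project root and that the package
# layout matches web_grater/web_grater/spiders/grater_spider.py.
from scrapy.crawler import CrawlerProcess

from web_grater.spiders.grater_spider import GraterSpider

process = CrawlerProcess()
process.crawl(GraterSpider)  # schedule the spider class
process.start()              # blocks until the crawl finishes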