Diffstat (limited to 'web_grater/web_grater/spiders/grater_spider.py')
-rw-r--r--  web_grater/web_grater/spiders/grater_spider.py  66
1 file changed, 66 insertions(+), 0 deletions(-)
diff --git a/web_grater/web_grater/spiders/grater_spider.py b/web_grater/web_grater/spiders/grater_spider.py
new file mode 100644
index 0000000..a8780d6
--- /dev/null
+++ b/web_grater/web_grater/spiders/grater_spider.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+"""Grater Spider for parsing URLs."""
+from typing import Generator
+
+from blessings import Terminal
+from scrapy.http import Request
+from scrapy.http.response.html import HtmlResponse
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import Spider
+
+t = Terminal()
+
+# The site you want to crawl
+WEBSITE = "www.example.com"
+
+
+class CustomLinkExtractor(LinkExtractor):
+    """Custom extractor for checking results."""
+
+    def extract_links(self, response):
+        """Get links from a page.
+
+        If you run the scraper and find that a URL is broken and want to
+        know which page it was on, uncomment the block below and update
+        the search term; pages containing that link will be printed in
+        yellow so you can find them.
+        """
+        links = super().extract_links(response)
+        # for link in links:
+        #     if 'insert-search-term-here' in link.url:
+        #         print(f'{t.yellow}200 Page: {response.url}{t.normal}')
+        return links
+
+
+class GraterSpider(Spider):
+    """Grater Spider."""
+
+    name = "grater"
+    allowed_domains = [WEBSITE]
+    start_urls = [f"https://{WEBSITE}"]
+
+    def __init__(self, *args, **kwargs):
+        """Init the custom link extractor."""
+        self.link_extractor = CustomLinkExtractor()
+        return super().__init__(*args, **kwargs)
+
+    def parse(self, response: HtmlResponse) -> Generator:
+        """Parse a page, following every link and image it contains."""
+        if response.status == 200:
+            print(f"{t.green}200 Page: {response.url}{t.normal}")
+        else:
+            print(f"{t.red}{response.status} Page: {response.url}{t.normal}")
+        # Follow all links found on the current page
+        for link in self.link_extractor.extract_links(response):
+            yield Request(link.url, callback=self.parse)
+        # Request all the images on the current page
+        img_urls = response.css("img::attr(src)").extract()
+        for img_url in img_urls:
+            yield response.follow(img_url, callback=self.parse_image_request)
+
+    def parse_image_request(self, response: HtmlResponse) -> None:
+        """Report the status of an image request."""
+        if response.status == 200:
+            print(f"{t.green}200 Image: {response.url}{t.normal}")
+        else:
+            print(f"{t.red}{response.status} Image: {response.url}{t.normal}")
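
A note on running this spider: by default Scrapy's HttpError middleware drops non-2xx responses before they reach spider callbacks, so the red error branches above would normally never fire. Below is a minimal sketch of running the spider in-process with error statuses allowed through. The import path `web_grater.spiders.grater_spider` is an assumption based on the file path in this diff; `scrapy crawl grater` from the project root is the usual command-line equivalent.

    # sketch: run the grater spider in-process (assumes the standard
    # Scrapy project layout implied by this diff's file path)
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    from web_grater.spiders.grater_spider import GraterSpider

    settings = get_project_settings()
    # Scrapy's HttpError middleware normally filters out non-2xx
    # responses before they reach callbacks; allowing all statuses
    # through lets the spider report broken pages and images in red.
    settings.set("HTTPERROR_ALLOW_ALL", True)

    process = CrawlerProcess(settings)
    process.crawl(GraterSpider)
    process.start()  # blocks until the crawl finishes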