# -*- coding: utf-8 -*-
"""Grater Spider for crawling a site and reporting broken links."""
from typing import Generator

from blessings import Terminal
from scrapy.http import Request, Response
from scrapy.http.response.html import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider

t = Terminal()

# The site you want to crawl
WEBSITE = "www.example.com"


class CustomLinkExtractor(LinkExtractor):
    """Custom extractor for checking results."""

    def extract_links(self, response):
        """Get links from a page.

        If the scraper reports a broken URL and you want to know which
        page linked to it, uncomment the lines below and update the
        search term. Every page containing that link is then printed
        in yellow so you can find it.
        """
        links = super().extract_links(response)
        # for link in links:
        #     if 'insert-search-term-here' in link.url:
        #         print(f'{t.yellow}200 Page: {response.url}{t.normal}')
        return links


class GraterSpider(Spider):
    """Grater Spider."""

    name = "grater"
    allowed_domains = [WEBSITE]
    start_urls = [f"https://{WEBSITE}"]
    # Let error responses (404, 500, ...) reach the parse callbacks.
    # By default Scrapy's HttpErrorMiddleware drops non-200 responses,
    # so broken links would never be reported.
    custom_settings = {"HTTPERROR_ALLOW_ALL": True}

    def __init__(self, *args, **kwargs):
        """Init custom link extractor."""
        super().__init__(*args, **kwargs)
        self.link_extractor = CustomLinkExtractor()

    def parse(self, response: HtmlResponse) -> Generator:
        """Parse a page."""
        if response.status == 200:
            print(f"{t.green}200 Page: {response.url}{t.normal}")
        else:
            print(f"{t.red}{response.status} Page: {response.url}{t.normal}")

        # Follow every link on the current page
        for link in self.link_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse)

        # Request every image on the current page
        img_urls = response.css("img::attr(src)").getall()
        for img_url in img_urls:
            yield response.follow(img_url, callback=self.parse_image_request)

    def parse_image_request(self, response: Response) -> None:
        """Check the response status for an image request."""
        if response.status == 200:
            print(f"{t.green}200 Image: {response.url}{t.normal}")
        else:
            print(f"{t.red}{response.status} Image: {response.url}{t.normal}")
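
# Usage sketch: ``scrapy runspider`` executes a standalone spider file
# without needing a full Scrapy project. The filename ``grater.py`` is
# an assumption for illustration; use whatever name this file is saved
# under.
#
#     scrapy runspider grater.py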