path: root/web_grater/web_grater/spiders/grater_spider.py

# -*- coding: utf-8 -*-
"""Grater Spider for parsing urls."""
from typing import Generator

from blessings import Terminal
from scrapy.http import Request, Response
from scrapy.http.response.html import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider

t = Terminal()

# The site you want to crawl
WEBSITE = "www.example.com"


class CustomLinkExtractor(LinkExtractor):
    """Custom extractor for checking results."""

    def extract_links(self, response):
        """Get links from a page.

        If you run the scraper and find that there is a url that is broken
        and you want to know which page it was on you can uncomment and upate
        the term below so that it will print pages containing that link in
        yellow so you can find them
        """
        links = super().extract_links(response)
        # for link in links:
        #     if 'insert-search-term-here' in link.url:
        #         print(f'{t.yellow}200 Page: {response.url}{t.normal}')
        return links


class GraterSpider(Spider):
    """Grater Spider."""

    name = "grater"
    allowed_domains = [WEBSITE]
    start_urls = [f"https://{WEBSITE}"]
    # Let non-2xx responses reach the callbacks; without this, Scrapy's
    # HttpErrorMiddleware silently drops them and broken links are never
    # reported.
    custom_settings = {"HTTPERROR_ALLOW_ALL": True}

    def __init__(self, *args, **kwargs):
        """Init the custom link extractor."""
        super().__init__(*args, **kwargs)
        self.link_extractor = CustomLinkExtractor()

    def parse(self, response: HtmlResponse) -> Generator:
        """Parse a page."""
        if response.status == 200:
            print(f"{t.green}200 Page: {response.url}{t.normal}")
        else:
            print(f"{t.red}{response.status} Page: {response.url}{t.normal}")
        # Parse all links on the current page
        for link in self.link_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse)
        # Request every image on the page so its status can be checked
        img_urls = response.css("img::attr(src)").getall()
        for img_url in img_urls:
            yield response.follow(img_url, callback=self.parse_image_request)

    def parse_image_request(self, response: Response) -> None:
        """Report the status of a fetched image."""
        if response.status == 200:
            print(f"{t.green}200 Image: {response.url}{t.normal}")
        else:
            print(f"{t.red}{response.status} Image: {response.url}{t.normal}")