diff options
Diffstat (limited to 'myproject/myproject/spiders')
-rw-r--r-- | myproject/myproject/spiders/__init__.py | 4 | ||||
-rw-r--r-- | myproject/myproject/spiders/blogitems.py | 32 | ||||
-rw-r--r-- | myproject/myproject/spiders/linkfinder.py | 18 | ||||
-rw-r--r-- | myproject/myproject/spiders/shell.py | 13 |
4 files changed, 67 insertions, 0 deletions
# -*- coding: utf-8 -*-
# Reconstructed from the cgit diff dump: four new files under
# myproject/myproject/spiders/ (__init__.py, blogitems.py, linkfinder.py,
# shell.py), re-formatted as conventional Python.  File boundaries are
# marked by the banner comments below.

# --- myproject/myproject/spiders/__init__.py -------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

# --- myproject/myproject/spiders/blogitems.py ------------------------------
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from ..items import BlogPostItem


class BlogItemsSpider(scrapy.Spider):
    """Scrape post title/date/year from the blog's per-tag archive pages."""

    name = 'blogitems'
    allowed_domains = ['codyhiar.com']
    start_urls = ['http://codyhiar.com/']

    def __init__(self, tag=None, *args, **kwargs):
        # ``tag`` arrives via ``scrapy crawl blogitems -a tag=<name>``.
        # NOTE(review): with no ``-a tag`` argument the start URL becomes
        # ``.../tags/None`` — confirm callers always supply a tag.
        super().__init__(*args, **kwargs)
        self.start_urls = ['https://www.codyhiar.com/tags/{}'.format(tag)]

    def parse(self, response):
        """Yield one BlogPostItem per post in every year archive block."""
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        for year_archive in response.css('.archive'):
            year = year_archive.css('h3::text').extract_first()
            for post in year_archive.css('.post-item'):
                title = post.css('a::text').extract_first().strip()
                date = post.css('.post-time::text').extract_first()
                # BUG FIX: the original built the item and only print()ed it,
                # so nothing ever reached Scrapy's item pipelines or feed
                # exporters.  Items must be yielded from parse().
                yield BlogPostItem(
                    title=title,
                    date=date,
                    year=year,
                )


# --- myproject/myproject/spiders/linkfinder.py -----------------------------
class LinkFinderSpider(CrawlSpider):
    """Crawl the site and print every internal link discovered."""

    name = 'linkfinder'
    # BUG FIX: the original used 'www.codyhiar.com', which makes the offsite
    # middleware drop links on the bare 'codyhiar.com' host — the very host
    # the start URL and print_internal_links target.  Using the registered
    # domain allows the bare host and all of its subdomains (incl. www).
    allowed_domains = ['codyhiar.com']
    start_urls = ['https://codyhiar.com/']

    rules = (
        Rule(LinkExtractor(allow=()), process_links='print_internal_links'),
    )

    def print_internal_links(self, links):
        """Print internal links; return all links unchanged for crawling."""
        for link in links:
            if 'codyhiar.com' in link.url:
                print(link.url)
        return links


# --- myproject/myproject/spiders/shell.py ----------------------------------
class ShellSpider(scrapy.Spider):
    """Invoke the scrapy shell from the spider."""

    name = 'shell'
    allowed_domains = ['codyhiar.com']
    start_urls = ['http://codyhiar.com/']

    def parse(self, response):
        # Drop into an interactive shell holding this response, for debugging.
        from scrapy.shell import inspect_response
        inspect_response(response, self)