diff options
Diffstat (limited to 'myproject/myproject/spiders/blogitems.py')
-rw-r--r-- | myproject/myproject/spiders/blogitems.py | 32 |
1 files changed, 32 insertions, 0 deletions
# -*- coding: utf-8 -*-
"""Spider that collects blog post entries from a codyhiar.com tag page."""
import scrapy

from ..items import BlogPostItem


class BlogItemsSpider(scrapy.Spider):
    """Scrape the title, date and year of each post listed on a tag page.

    The tag is supplied as a spider argument, e.g.
    ``scrapy crawl blogitems -a tag=<tag>``.
    """

    name = 'blogitems'
    allowed_domains = ['codyhiar.com']
    # Used only when no tag argument is given.
    start_urls = ['http://codyhiar.com/']

    def __init__(self, tag=None, *args, **kwargs):
        """Point the spider at the listing page of ``tag``.

        Args:
            tag: Tag whose post listing should be crawled. When omitted,
                the class-level ``start_urls`` is left untouched instead
                of requesting the non-existent ``/tags/None`` page.
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        if tag is not None:
            self.start_urls = ['https://www.codyhiar.com/tags/{}'.format(tag)]

    def parse(self, response):
        """Yield one ``BlogPostItem`` per post found in ``response``.

        Every ``.archive`` element is treated as a group of posts whose
        ``h3`` text is recorded as the year; each ``.post-item`` inside it
        contributes its link text as the title and its ``.post-time`` text
        as the date.

        Args:
            response: The downloaded tag page.

        Yields:
            BlogPostItem: An item with ``title``, ``date`` and ``year`` set.
            ``date`` and ``year`` are ``None`` when the matching element is
            missing; ``title`` falls back to an empty string.
        """
        # Debugging tip: scrapy.shell.inspect_response(response, self) can be
        # called here to open an interactive shell on the response.
        for year_archive in response.css('.archive'):
            year = year_archive.css('h3::text').extract_first()
            for post in year_archive.css('.post-item'):
                # default='' keeps .strip() from failing on a post that has
                # no anchor text.
                title = post.css('a::text').extract_first(default='').strip()
                date = post.css('.post-time::text').extract_first()
                # Yield (rather than print) so the item reaches Scrapy's
                # item pipelines and feed exports.
                yield BlogPostItem(
                    title=title,
                    date=date,
                    year=year,
                )