Quantcast
Channel: CodeSection,代码区,Python开发技术文章_教程 - CodeSec
Viewing all articles
Browse latest Browse all 9596

Scrapy - Scraping Different Web Pages into a Scrapy Script

$
0
0

I'm creating a web app that scrapes a long list of shoes from different websites. Here are my two individual scrapy scripts:

http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3

from scrapy import Spider
from scrapy.http import Request


class ShoesSpider(Spider):
    """Scrape men's clearance soccer shoes from the Nike store.

    ``parse`` handles the listing page and schedules one request per
    product link; ``parse_shoes`` turns each product page into an item dict.
    """

    name = "shoes"
    allowed_domains = ["store.nike.com"]
    start_urls = ['http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3']

    def parse(self, response):
        """Yield a request for every product link found on the listing page."""
        shoes = response.xpath('//*[@class="grid-item-image-wrapper sprite-sheet sprite-index-0"]/a/@href').extract()
        for shoe in shoes:
            # urljoin leaves absolute URLs untouched and resolves relative
            # hrefs, which Request() would otherwise reject.
            yield Request(response.urljoin(shoe), callback=self.parse_shoes)

    def parse_shoes(self, response):
        """Yield one item dict (url, name, price, sizes, shoe_type) for a product page."""
        url = response.url
        name = response.xpath('//*[@itemprop="name"]/text()').extract_first()

        price = response.xpath('//*[@itemprop="price"]/text()').extract_first()
        # extract_first() returns None when nothing matches; only strip the
        # currency sign when a price was actually found.
        if price is not None:
            price = price.replace('$', '')

        shoe_type = response.css('.exp-product-subtitle::text').extract_first()

        # Take the text of every size <option>, skipping those whose class
        # marks them as not in stock.
        sizes = response.xpath('//*[@class="nsg-form--drop-down exp-pdp-size-dropdown exp-pdp-dropdown two-column-dropdown"]/option')
        sizes = sizes.xpath('text()[not(parent::option/@class="exp-pdp-size-not-in-stock selectBox-disabled")]').extract()
        sizes = [s.strip() for s in sizes]

        yield {
            'url': url,
            'name': name,
            'price': price,
            'sizes': sizes,
            'shoe_type': shoe_type,
        }

http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp

from scrapy import Spider
from scrapy.http import Request


class ShoesSpider(Spider):
    """Scrape clearance soccer cleats from Dick's Sporting Goods.

    ``parse`` handles the listing page and schedules one request per
    product link; ``parse_shoes`` turns each product page into an item dict.
    """

    name = "shoes"
    allowed_domains = ["dickssportinggoods.com"]
    start_urls = ['http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp']

    def parse(self, response):
        """Yield a request for every product link found on the listing page."""
        shoes = response.xpath('//*[@class="fplpTitle header4"]/a/@href').extract()
        for shoe in shoes:
            # urljoin leaves absolute URLs untouched and resolves relative
            # hrefs, which Request() would otherwise reject.
            yield Request(response.urljoin(shoe), callback=self.parse_shoes)

    def parse_shoes(self, response):
        """Yield one item dict (url, name, price, sizes, shoe_type) for a product page.

        Products with no size swatches are still emitted, with ``sizes``
        set to an empty list. ``shoe_type`` is always the empty string
        because this spider does not extract it.
        """
        sizes = response.xpath('//*[@class="swatches clearfix"]/input/@value').extract()
        url = response.url
        name = response.xpath('.//*[@id="PageHeading_3074457345618261107"]/h1/text()').extract_first()
        price = response.xpath('.//*[@itemprop="price"]/text()').extract_first()

        yield {
            'url': url,
            'name': name,
            'price': price,
            'sizes': sizes,
            'shoe_type': '',
        }

How can I combine the two into a single spider? I have already gone through the Scrapy documentation, but I haven't seen this covered; it only explains how to scrape two addresses from a root address. Thanks.

Put both domains in allowed_domains and both URLs in start_urls, then use a simple if-else to determine which part of the code to execute.

from scrapy import Spider
from scrapy.http import Request


class ShoesSpider(Spider):
    """Scrape clearance soccer shoes from both Nike and Dick's Sporting Goods.

    Each callback decides which site-specific selectors to use by checking
    which known domain appears in ``response.url``.
    """

    name = "shoes"
    allowed_domains = ["store.nike.com", "dickssportinggoods.com"]
    start_urls = [
        'http://store.nike.com/us/en_us/pw/mens-clearance-soccer-shoes/47Z7puZ896Zoi3',
        'http://www.dickssportinggoods.com/products/clearance-soccer-cleats.jsp',
    ]

    # XPath that locates product-page links on each site's listing page,
    # keyed by the domain substring looked for in the response URL.
    _LINK_XPATHS = {
        "store.nike.com": '//*[@class="grid-item-image-wrapper sprite-sheet sprite-index-0"]/a/@href',
        "dickssportinggoods.com": '//*[@class="fplpTitle header4"]/a/@href',
    }

    def parse(self, response):
        """Yield a request for every product link found on a listing page."""
        for domain, link_xpath in self._LINK_XPATHS.items():
            if domain in response.url:
                break
        else:
            # No known domain matched: without this guard the link list
            # would be unbound and the callback would crash.
            self.logger.warning("No link selector configured for %s", response.url)
            return

        for shoe in response.xpath(link_xpath).extract():
            # urljoin leaves absolute URLs untouched and resolves relative
            # hrefs, which Request() would otherwise reject.
            yield Request(response.urljoin(shoe), callback=self.parse_shoes)

    def parse_shoes(self, response):
        """Yield one item dict (url, name, price, sizes, shoe_type) for a product page.

        Yields nothing when the response URL matches neither known domain.
        """
        if "store.nike.com" in response.url:
            yield self._parse_nike_shoe(response)
        elif "dickssportinggoods.com" in response.url:
            yield self._parse_dicks_shoe(response)

    def _parse_nike_shoe(self, response):
        """Build the item dict for a Nike product page."""
        price = response.xpath('//*[@itemprop="price"]/text()').extract_first()
        # extract_first() returns None when nothing matches; only strip the
        # currency sign when a price was actually found.
        if price is not None:
            price = price.replace('$', '')

        # Take the text of every size <option>, skipping those whose class
        # marks them as not in stock.
        sizes = response.xpath('//*[@class="nsg-form--drop-down exp-pdp-size-dropdown exp-pdp-dropdown two-column-dropdown"]/option')
        sizes = sizes.xpath('text()[not(parent::option/@class="exp-pdp-size-not-in-stock selectBox-disabled")]').extract()

        return {
            'url': response.url,
            'name': response.xpath('//*[@itemprop="name"]/text()').extract_first(),
            'price': price,
            'sizes': [s.strip() for s in sizes],
            'shoe_type': response.css('.exp-product-subtitle::text').extract_first(),
        }

    def _parse_dicks_shoe(self, response):
        """Build the item dict for a Dick's Sporting Goods product page.

        Products with no size swatches are still emitted, with ``sizes``
        set to an empty list; ``shoe_type`` is not extracted for this site.
        """
        return {
            'url': response.url,
            'name': response.xpath('.//*[@id="PageHeading_3074457345618261107"]/h1/text()').extract_first(),
            'price': response.xpath('.//*[@itemprop="price"]/text()').extract_first(),
            'sizes': response.xpath('//*[@class="swatches clearfix"]/input/@value').extract(),
            'shoe_type': '',
        }

Viewing all articles
Browse latest Browse all 9596

Trending Articles