The page https://www.mouthshut.com/builders-and-developers/bangalore lists multiple builders (for example https://www.mouthshut.com/builders-and-developers/SJR-PrimeCorp-Bangalore-reviews-925981671, https://www.mouthshut.com/builders-and-developers/Mangalam-Group-Bangalore-reviews-925995292, etc.). On clicking any builder, there are many builder links (for example https://www.mouthshut.com/builders-and-developers/SJR-PrimeCorp-Bangalore-review-mopmlquprtn, https://www.mouthshut.com/builders-and-developers/SJR, etc.). I want to extract the review text selected by the XPath
div[@class='user-review']/p/text()
.
# -*- coding: utf-8 -*-
import scrapy
# item class included here
class RealestatebuildersnamesSpider(scrapy.Item):
    """Scrapy item produced by the spider's ``parse_review`` callback.

    Despite the ``Spider`` suffix in its name, this class is an item
    container (it derives from ``scrapy.Item``), not a spider.
    """

    # Populated in ``parse_review`` with the joined text extracted from the
    # review page, even though the field name mentions "links".
    builder_review_links = scrapy.Field()
class RealestatebuildersnameSpider(scrapy.Spider):
    """Crawl a builder listing, follow each builder, and scrape review text.

    The crawl has three stages, one callback per stage:

    1. ``parse`` reads the listing page in ``start_urls`` and requests every
       builder link found on it.
    2. ``parse_attr`` reads a builder page and requests every review link
       found on it.
    3. ``parse_review`` reads a review page and emits one item holding the
       review text.
    """

    name = 'RealEstateBuildersName'
    allowed_domains = ['mouthshut.com']
    start_urls = ['https://www.mouthshut.com/builders-and-developers/bangalore?page=1&sort=reviewcnt']

    def parse(self, response):
        """Yield one request per builder link found on the listing page."""
        builder_hrefs = response.xpath(
            "//div[@class='col-xs-12 col-sm-6 col-md-3 col-lg-3 prod-box-wrapper']"
            "/div[@class='prod-box grid']"
            "/div[@class='details']"
            "/div[@class='title']/a/@href"
        ).extract()
        for href in builder_hrefs:
            # scrapy.Request rejects URLs without a scheme, so resolve the
            # href against the page URL; absolute hrefs pass through as-is.
            yield scrapy.Request(response.urljoin(href), callback=self.parse_attr)

    def parse_attr(self, response):
        """Yield one request per review link found on a builder page."""
        review_hrefs = response.xpath(
            "//div[@class='row review-article']//div[@class='title']//a/@href"
        ).extract()
        for href in review_hrefs:
            # Same reasoning as in parse(): tolerate relative hrefs.
            yield scrapy.Request(response.urljoin(href), callback=self.parse_review)

    def parse_review(self, response):
        """Yield an item whose ``builder_review_links`` holds the review text.

        Text nodes under ``div.user-review > p`` are stripped of surrounding
        whitespace, empty fragments are dropped, and the rest are joined with
        a single space so adjacent paragraphs do not run together.
        """
        fragments = response.xpath("//div[@class='user-review']/p/text()").extract()
        cleaned = [text.strip() for text in fragments if text.strip()]

        item = RealestatebuildersnamesSpider()
        item["builder_review_links"] = " ".join(cleaned)

        # Use the spider logger instead of a bare print so the message
        # respects Scrapy's log level and formatting.
        self.logger.debug("Scraped review from %s", response.url)
        yield item
How can I extract the data?