Scrapy 爬取汽车信息数据
澳门新葡京官网
澳门新葡京官网
当前位置 : 澳门新葡京官网 > 澳门新葡京官网

Scrapy 爬取汽车信息数据

# -*- coding: utf-8 -*-
import scrapy
class CarinfosSpider(scrapy.Spider):
    """Crawl car brands, series and model specifications from xgo.com.cn.

    Crawl chain (every step is a Scrapy callback):

    ``parse`` (brand index) -> ``parse_brand`` (series of one brand) ->
    ``parse_type`` (series page) -> ``parse_cars`` (models of one series) ->
    ``parse_car_link`` (model page) -> ``parse_car`` (specification page).

    Brand, series and model IDs are handed out sequentially, in the order in
    which the corresponding pages are processed by their callbacks.
    """

    name = 'carinfos'
    start_urls = [
        'http://www.xgo.com.cn/brand.html',
    ]

    # Keys this spider forwards from one request to the next through meta.
    _CARRIED_KEYS = ('brand_id', 'brand_name', 'type_id', 'type_name')

    # XPath selecting the text of every value cell of the specification table.
    _SPEC_CELLS_XPATH = '//div[@id="peizhi"]//td[@class="bor-l"]/text()'

    # (item field, position of its value among the specification cells).
    _SPEC_FIELDS = (
        ('where', 0),
        ('level', 1),
        ('year', 2),
        ('displacement', 3),
        ('maximumSpeed', 4),
        ('officialAcceleration', 5),
        ('ministryOfIntegratedFuelConsumption', 6),
        ('vehicleQuality', 7),
        ('longHighWith', 9),
        ('bodyStructure1', 15),
        ('doorNum', 17),
        ('seatNum', 18),
        ('mailVolume', 19),
        ('model', 22),
        ('intakeForm', 24),
        ('fuelForm', 38),
        ('fuel', 39),
        ('fuleWay', 40),
        ('environmentalProtection', 43),
        ('powerType', 44),
        ('gearbox', 50),
        ('drivingMethod', 53),
        ('bodyStructure2', 59),
        ('frontBrakeType', 61),
        ('brakeType', 62),
        ('parkingBrakeType', 63),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and its per-instance ID counters."""
        super(CarinfosSpider, self).__init__(*args, **kwargs)
        # Next free ID per entity kind ('brand', 'type', 'car').
        self._id_counters = {}

    def _next_id(self, kind):
        """Return the next sequential ID for *kind*, starting at 0."""
        next_id = self._id_counters.get(kind, 0)
        self._id_counters[kind] = next_id + 1
        return next_id

    def _carried_meta(self, response):
        """Return a fresh dict with only the keys this spider forwards.

        ``response.meta`` is not forwarded wholesale because it may also hold
        keys that were not set by this spider.
        """
        return {key: response.meta[key] for key in self._CARRIED_KEYS}

    def parse(self, response):
        """Parse the start URL and request every brand page.

        Follows links such as http://www.xgo.com.cn/brand/abt/ and hands each
        response to ``parse_brand``.
        """
        links = response.xpath('//div[@class="l"]/a[1]/@href').extract()
        for link in links:
            # urljoin keeps absolute links unchanged and resolves relative ones.
            yield scrapy.Request(response.urljoin(link),
                                 callback=self.parse_brand)

    def parse_brand(self, response):
        """Parse one brand page and request each of its series pages.

        Series links such as http://www.xgo.com.cn/4990/ are rewritten to
        http://www.xgo.com.cn/4990/items.html before being requested.
        """
        brand_id = self._next_id('brand')
        brand_name = response.css('.brand_logo+h1::text').extract_first()
        # Extracted as in the original script; it is not forwarded further.
        brand_img = response.css('.brand_logo img::attr(src)').extract_first()
        self.logger.debug('brand %s (%s) logo: %s',
                          brand_id, brand_name, brand_img)

        links = response.css('.car-list p a::attr(href)').extract()
        self.logger.debug('series links: %s', links)
        for link in links:
            # Normalise the trailing slash so the result is always .../items.html
            full_url = response.urljoin(link).rstrip('/') + '/items.html'
            yield scrapy.Request(
                full_url,
                callback=self.parse_type,
                meta={'brand_id': brand_id, 'brand_name': brand_name},
            )

    def parse_type(self, response):
        """Parse one series page: assign its ID and name, then list its models."""
        type_id = self._next_id('type')
        type_name = response.css('.car_banner_l .num::text').extract_first()

        # NOTE(review): the original requested an undefined ``full_url`` here.
        # The only URL available in this callback is the current page, so it is
        # requested again for ``parse_cars``; dont_filter stops the duplicate
        # filter from dropping the repeated URL. Confirm this matches the
        # intended page.
        yield scrapy.Request(
            response.url,
            callback=self.parse_cars,
            meta={
                'brand_id': response.meta['brand_id'],
                'brand_name': response.meta['brand_name'],
                'type_id': type_id,
                'type_name': type_name,
            },
            dont_filter=True,
        )

    def parse_cars(self, response):
        """Request the page of every car model listed for a series.

        Follows links such as http://product.xgo.com.cn/other/index190852.shtml.
        """
        links = response.css(
            '#theanchor .car_banner_r ul li p a::attr(href)').extract()
        for link in links:
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse_car_link,
                meta=self._carried_meta(response),
            )

    def parse_car_link(self, response):
        """Follow a model page to its specification page.

        The specification link (e.g.
        http://product.xgo.com.cn/191/190852/param.shtml) is taken as the
        fourth link of the navigation box, as in the original script.
        """
        nav_links = response.css('.cxk-navbox ul li a::attr(href)').extract()
        if len(nav_links) < 4:
            # Without the fourth link there is no specification page to follow.
            self.logger.warning('no specification link on %s', response.url)
            return
        yield scrapy.Request(
            response.urljoin(nav_links[3]),
            callback=self.parse_car,
            meta=self._carried_meta(response),
        )

    def parse_car(self, response):
        """Parse a specification page and yield one item for the car model.

        Specification fields whose cell is missing from the page are set to
        ``None`` instead of aborting the whole item.
        """
        item = {
            'manufacturers1_id': response.meta['brand_id'],
            'manufacturers1': response.meta['brand_name'],
            'manufacturers2_id': response.meta['type_id'],
            'manufacturers2': response.meta['type_name'],
            'car_id': self._next_id('car'),
            'name': response.css('.offer_topnav h3 a::text').extract_first(),
            # Kept as the list of all matching text nodes, as in the original.
            'price': response.css('.cxkmoneys .cxk-jg::text').extract(),
        }

        # Evaluate the table XPath once; every field is a position in it.
        cells = response.xpath(self._SPEC_CELLS_XPATH).extract()
        for field, index in self._SPEC_FIELDS:
            item[field] = cells[index] if index < len(cells) else None

        yield item

写的脚本有误

该答案已被忽略,原因:不符合答题规范:内容不是答案,可用评论、投票替代,无意义的内容:赞、顶、同问等毫无意义的内容

栏目列表

广告位

澳门新葡京官网