
Crawler Project 01 - Shandong Province Bidding and Procurement Network

2019-01-28


# -*- coding: utf-8 -*-
import scrapy
from scrapy.cmdline import execute
from urllib.parse import urljoin

"""
Shandong Province Procurement and Bidding Network (山东省采购与招标网)
"""


class SdbSpider(scrapy.Spider):
    name = 'sdb'
    allowed_domains = ['www.sdbidding.org.cn']
    # infoType 11: bidding announcements; 12: winning-bid notices;
    # 13: procurement announcements; 14: deal notices
    start_urls = [
        'http://www.sdbidding.org.cn/bulletins?infoType=11',
        'http://www.sdbidding.org.cn/bulletins?infoType=12',
        'http://www.sdbidding.org.cn/bulletins?infoType=13&type=1',
        'http://www.sdbidding.org.cn/bulletins?infoType=14&type=2',
    ]

    def start_requests(self):
        for start_url in self.start_urls:
            # Reset the counter for each category; if it were initialized
            # outside the loop, only the first start URL would be paginated.
            num = 0
            while num <= 20:  # number of pages to crawl, 20 by default
                num += 1
                yield scrapy.FormRequest(
                    start_url,
                    formdata={"pageNo": str(num)},
                    callback=self.get_page,
                )

    def get_page(self, response):
        start_url = response.url
        urls = response.xpath('//td[@class="tit"]//a//@href').extract()
        for url in urls:
            # The detail links are relative, so resolve them against
            # the listing page URL before requesting them.
            end_url = urljoin(start_url, url)
            yield scrapy.Request(url=end_url, callback=self.get_content)

    def get_content(self, response):
        # Parse the detail page; extract_first() returns None instead of
        # raising IndexError when a node is missing.
        title = response.xpath('//h3//text()').extract_first()
        print(title)
        ctime = response.xpath('//div[@class="detail-title"]//p//text()').extract_first()
        print(ctime)
        content = response.xpath('//div[@class="details"]//p//text()').extract_first()
        print(content)
        content_xml = response.xpath('//div[@class="details"]')
        print(content_xml)


if __name__ == '__main__':
    execute(["scrapy", "crawl", "sdb"])
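In get_page, the hrefs scraped from the listing table are relative paths, which is why they are passed through urljoin before being requested. A quick standalone illustration of that resolution step (the relative href below is made up for demonstration):

```python
from urllib.parse import urljoin

# Hypothetical listing URL and relative detail link, for illustration only.
listing_url = 'http://www.sdbidding.org.cn/bulletins?infoType=11'
relative_href = '/bulletins/12345'

# urljoin resolves the relative path against the listing page's URL.
end_url = urljoin(listing_url, relative_href)
print(end_url)  # http://www.sdbidding.org.cn/bulletins/12345
```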

This is just a quick, simple implementation based on the Scrapy framework.
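One detail worth noting about the pagination: FormRequest sends the formdata as a form-encoded POST body, and Scrapy's duplicate filter fingerprints POST requests including their body, so the repeated requests to the same URL with different pageNo values are all treated as distinct. A minimal sketch of what those bodies look like, using the standard library's urlencode:

```python
from urllib.parse import urlencode

# FormRequest encodes formdata as application/x-www-form-urlencoded;
# each page number yields a different request body.
bodies = [urlencode({"pageNo": str(num)}) for num in range(1, 4)]
print(bodies)  # ['pageNo=1', 'pageNo=2', 'pageNo=3']
```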

If anything here could be improved, corrections are welcome, and please point out any mistakes you find. Thank you, everyone.

Thanks for following and supporting the author.
