不知道为什么,注释了这一段就能输出到 pipeline
代码内容其实就是先进一个 url ,获得 cookie ,再带着 cookie 访问,把验证码下下来解析,然后再把数据 post 上去,要是验证码错了,就重复下验证码再解析,再 post ,直到 post 成功。 bug 代码在最后,重复下载验证码的部分,各位熟悉 scrapy 的大牛知道为什么吗?
# -*- coding: gbk -*- import scrapy from scrapy.http import FormRequest import json import os from datetime import datetime from scrapy.selector import Selector from teacherCourse.handlePic import handle from teacherCourse.items import DetailProfItem from teacherCourse.items import DetailProfCourseItem from teacherCourse.items import containItem class GetTeacherCourseSpider(scrapy.Spider): name = 'TeacherCourse' # custom_settings = { # 'ITEM_PIPELINES': { # 'teacherCourse.pipelines.TeacherCoursePipeline': 300, # } # } def __init__(self, selXNXQ='', titleCode=''): self.getUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx' # first self.vcodeUrl = 'http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx' # second self.postUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx' # third self.findSessiOnId= None # to save the cookies self.XNXQ = selXNXQ self.titleCode = titleCode def start_requests(self): request = scrapy.Request(self.getUrl, callback = self.downloadPic) yield request def downloadPic(self, response): # download the picture # find the session id self.findSessiOnId= response.headers.getlist('Set-Cookie')[0].decode().split(";")[0].split("=") request = scrapy.Request(self.vcodeUrl, cookies= {self.findSessionId[0]: self.findSessionId[1]}, callback = self.getAndHandleYzm) yield request def getAndHandleYzm(self, response): yzm = handle(response.body) yield FormRequest(self.postUrl, formdata={'Sel_XNXQ': '20151', 'sel_zc': '011', 'txt_yzm': yzm, 'type': '2'}, headers={ 'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx', 'Cookie': self.findSessionId[0] + '=' + self.findSessionId[1], }, callback=self.parse) def parse(self, response): body = response.body.decode('gbk') # bug code begin num = body.find('alert') if num != -1: # means CAPTCHA validation fails, need to re-request the CAPTCHA yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % (datetime.now().microsecond / 1000), headers={ 'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx', 'Cookie': self.findSessionId[0]+'='+self.findSessionId[1] }, callback=self.getAndHandleYzm) # re request the url to solve the validation code fail problem # bug code done else: # parse data # self.parseData(body) item = containItem() item['first'] = len(body) return item 