class SeleniumMiddleware(object):
    """Downloader middleware that loads pages through ``spider.browser``.

    A request whose ``meta['used_selenium']`` is truthy is handled as a login
    request: the login form is filled in, the captcha is read from stdin, and
    the browser's cookies are written to ``cookie_file``.  Any other request
    replaces the browser's cookies with the ones stored in that file.
    """

    # Path of the JSON file in which the browser cookies are persisted.
    cookie_file = 'cookies.json'

    def process_request(self, request, spider):
        """Open ``request.url`` in the spider's browser and return its HTML.

        Args:
            request: The request being processed.  The login request
                (FormRequest) must set ``meta['used_selenium']`` to True.
            spider: The running spider; it must expose a Selenium-style
                driver as ``spider.browser``.

        Returns:
            An ``HtmlResponse`` built from the browser's current page source.

        Raises:
            OSError: If ``cookie_file`` cannot be opened (for example when a
                non-login request arrives before any cookies were saved).
        """
        browser = spider.browser
        browser.get(request.url)

        if request.meta.get('used_selenium', False):
            self._login_and_save_cookies(browser)
            # Reset the same key that was read above so the login is not
            # repeated for this request.  The original code wrote the
            # differently spelled 'usedSelenium' key, which never cleared
            # the flag; that key is still written in case anything reads it.
            request.meta['used_selenium'] = False
            request.meta['usedSelenium'] = False
        else:
            self._restore_cookies(browser)

        # Short pause before the page source is captured.
        time.sleep(2)
        return HtmlResponse(
            request.url,
            body=browser.page_source,
            encoding='utf-8',
            request=request,
        )

    def _login_and_save_cookies(self, browser):
        """Fill in and submit the login form, then save the browser cookies."""
        # NOTE(review): the credentials look like hard-coded placeholders --
        # confirm where the real values are expected to come from.
        for field_id, value in (("email", "xxxxxx"), ("password", "xxxxxx")):
            field = browser.find_element_by_id(field_id)
            field.clear()
            field.send_keys(value)
            time.sleep(2)

        browser.find_element_by_id("captcha_field").clear()
        # The captcha is typed in by the operator on stdin.
        verification = input("输入验证码\n>").strip()
        browser.find_element_by_id("captcha_field").send_keys(verification)
        browser.find_element_by_class_name("btn-submit").click()

        # After the login form is submitted, save the cookies to a local file.
        with open(self.cookie_file, 'w', encoding='utf-8') as f:
            json.dump(browser.get_cookies(), f)

    def _restore_cookies(self, browser):
        """Replace the browser's cookies with those saved in ``cookie_file``."""
        browser.delete_all_cookies()
        with open(self.cookie_file, 'r', encoding='utf-8') as f:
            saved_cookies = json.load(f)

        for cookie in saved_cookies:
            browser.add_cookie({
                # NOTE(review): the original comment says the domain needs a
                # leading dot, but the value has none -- confirm.
                'domain': 'www.douban.com',
                'name': cookie['name'],
                'value': cookie['value'],
                'secure': False,
                'httpOnly': True,
                'path': '/',
                'expires': None,
            })