# How to Simulate Login to Zhihu with Scrapy

## Introduction

In today's era of big data, web crawling has become an important way to collect data from the internet. Zhihu, China's largest knowledge-sharing community, holds a wealth of valuable user-generated content. However, Zhihu strictly limits what anonymous visitors can see, and much of its content is only available after logging in. This article explains in detail how to simulate logging in to Zhihu with the Scrapy framework and build a complete data-collection workflow.

## 1. Preparation

### 1.1 Environment Setup

First, make sure the following environment is in place:
```bash
# Create a virtual environment (recommended)
python -m venv zhihu_env
source zhihu_env/bin/activate   # Linux/Mac
zhihu_env\Scripts\activate      # Windows

# Install the required packages
pip install scrapy selenium pillow pytesseract
```
### 1.2 Analyzing the Login Flow

Analyze Zhihu's login flow with the browser developer tools. The two key pieces are the `X-Xsrftoken` request header (whose value comes from the `_xsrf` cookie) and the parameters of the login request. Inspecting the login request in Chrome DevTools shows:
```
POST https://www.zhihu.com/api/v3/oauth/sign_in HTTP/1.1
Content-Type: application/x-www-form-urlencoded

client_id=c3cef7c66a1843f8b3a9e6a1e3160e20
grant_type=password
timestamp=1634567890123
source=com.zhihu.web
signature=xxxxxx
username=your_username
password=your_password
captcha=xxxx
lang=en
utm_source=baidu
```
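The `signature` field is generated by Zhihu's login JavaScript. Based on earlier public analyses of that script, it appears to be an HMAC-SHA1 over the concatenation of `grant_type`, `client_id`, `source`, and `timestamp`. The sketch below follows that assumption; the key is copied from an older version of the script and may have changed, so verify it against the current JS before relying on it.

```python
import hashlib
import hmac
import time

def make_signature(grant_type, client_id, source, timestamp,
                   key=b'd1b964811afb40118a12068ff74a12f4'):
    # Assumed scheme: HMAC-SHA1 over grant_type + client_id + source + timestamp.
    # The key comes from an old version of Zhihu's login JS and may be outdated.
    h = hmac.new(key, digestmod=hashlib.sha1)
    h.update((grant_type + client_id + source + timestamp).encode('utf-8'))
    return h.hexdigest()

# Example usage with the parameters seen in the captured request
timestamp = str(int(time.time() * 1000))
signature = make_signature('password', 'c3cef7c66a1843f8b3a9e6a1e3160e20',
                           'com.zhihu.web', timestamp)
```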
## 2. Creating the Scrapy Project

Create the project and generate a spider skeleton:

```bash
scrapy startproject zhihu_login
cd zhihu_login
scrapy genspider zhihu www.zhihu.com
```

This produces the following project structure:
```
zhihu_login/
├── scrapy.cfg
└── zhihu_login
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        └── zhihu.py
```
## 3. Implementing Basic Login

Implement the basic login flow in zhihu.py:
```python
import json
import time

import scrapy


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']

    # Login API endpoint
    login_url = 'https://www.zhihu.com/api/v3/oauth/sign_in'

    def start_requests(self):
        # First request the home page to obtain the _xsrf token
        yield scrapy.Request(
            url='https://www.zhihu.com/',
            callback=self.get_xsrf,
            dont_filter=True
        )

    def get_xsrf(self, response):
        # Extract the _xsrf value from the Set-Cookie headers
        xsrf = ''
        for header in response.headers.getlist('Set-Cookie'):
            cookie = header.decode('utf-8').split(';')[0]
            if cookie.startswith('_xsrf='):
                xsrf = cookie.split('=', 1)[1]
                break

        # Build the login request.
        # Note: the real request also carries the 'signature' field analyzed
        # in section 1.2; it is omitted here for brevity.
        formdata = {
            'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',
            'grant_type': 'password',
            'timestamp': str(int(time.time() * 1000)),
            'source': 'com.zhihu.web',
            'username': 'your_username',
            'password': 'your_password',
            'captcha': '',
            'lang': 'en',
            'utm_source': 'baidu'
        }
        yield scrapy.FormRequest(
            url=self.login_url,
            formdata=formdata,
            callback=self.after_login,
            meta={'xsrf': xsrf},
            headers={
                'x-xsrftoken': xsrf,
                'Content-Type': 'application/x-www-form-urlencoded'
            }
        )

    def after_login(self, response):
        result = json.loads(response.text)
        if 'error' in result:
            self.logger.error('Login failed: %s', result['error']['message'])
        else:
            self.logger.info('Login succeeded')
            # Continue crawling once logged in
            for url in self.start_urls:
                yield scrapy.Request(url, dont_filter=True)
```
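The spider can then be run from the project root:

```bash
scrapy crawl zhihu
```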
## 4. Handling Captchas

Zhihu may ask for a captcha during login, so the spider needs captcha-handling logic:
```python
    # The methods below also need `import base64` and `import requests`
    # at the top of zhihu.py.

    def get_xsrf(self, response):
        # Extract the _xsrf value from the Set-Cookie headers (as before)
        xsrf = ''
        for header in response.headers.getlist('Set-Cookie'):
            cookie = header.decode('utf-8').split(';')[0]
            if cookie.startswith('_xsrf='):
                xsrf = cookie.split('=', 1)[1]
                break

        # Ask the captcha API whether a captcha is required
        captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'
        yield scrapy.Request(
            captcha_url,
            callback=self.handle_captcha,
            meta={'xsrf': xsrf},
            headers={'x-xsrftoken': xsrf}
        )

    def handle_captcha(self, response):
        result = json.loads(response.text)
        xsrf = response.meta['xsrf']
        if result.get('show_captcha', False):
            # A captcha is required
            captcha_data = self.download_captcha(xsrf)
            # Hand this off to a captcha-solving service, or type it in manually
            captcha = input('Enter the captcha: ')
        else:
            captcha = ''
        # Continue the login flow
        formdata = {
            # other parameters...
            'captcha': captcha
        }
        # Send the login request...

    def download_captcha(self, xsrf):
        captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'
        # Some analyses suggest the image must be requested with PUT rather
        # than GET; adjust this if the response has no img_base64 field.
        response = requests.get(
            captcha_url,
            headers={'x-xsrftoken': xsrf}
        )
        img_data = json.loads(response.text)['img_base64']
        img = base64.b64decode(img_data)
        with open('captcha.jpg', 'wb') as f:
            f.write(img)
        return img
```
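Since pillow and pytesseract were installed during setup, simple captchas can be attempted with OCR before falling back to manual input. This is only a best-effort sketch; Zhihu's captchas (especially the "click the inverted characters" variant) usually defeat plain OCR:

```python
from PIL import Image
import pytesseract


def ocr_captcha(path='captcha.jpg'):
    # Best-effort OCR of the downloaded captcha image
    image = Image.open(path).convert('L')  # convert to grayscale
    return pytesseract.image_to_string(image).strip()
```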
## 5. Falling Back to Selenium

For complex JavaScript-based verification, Selenium can be plugged into Scrapy through a downloader middleware:
```python
import time

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.by import By


class ZhihuSeleniumMiddleware:
    def process_request(self, request, spider):
        if request.meta.get('selenium'):
            driver = webdriver.Chrome()
            try:
                driver.get(request.url)
                # Perform the login in the real browser
                driver.find_element(By.CSS_SELECTOR, '.SignFlow-accountInput').send_keys('your_username')
                driver.find_element(By.CSS_SELECTOR, '.SignFlow-password input').send_keys('your_password')
                driver.find_element(By.CSS_SELECTOR, '.Button.SignFlow-submitButton').click()
                # Wait for the login to finish
                time.sleep(3)
                # Collect the session cookies
                cookies = driver.get_cookies()
                request.cookies = {c['name']: c['value'] for c in cookies}
                # Return a Response built from the rendered page
                return HtmlResponse(
                    url=driver.current_url,
                    body=driver.page_source.encode('utf-8'),
                    encoding='utf-8',
                    request=request
                )
            finally:
                driver.quit()
```
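A request opts into the Selenium path through its meta dictionary; for example (the sign-in URL and callback are placeholders, and the middleware must be enabled in DOWNLOADER_MIDDLEWARES as shown in the settings section):

```python
yield scrapy.Request(
    'https://www.zhihu.com/signin',
    meta={'selenium': True},   # routed through ZhihuSeleniumMiddleware
    callback=self.after_login,
    dont_filter=True
)
```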
## 6. Persisting Cookies

To avoid logging in on every run, the session cookies can be cached to disk with pickle:

```python
import pickle


class ZhihuSpider(scrapy.Spider):
    # ...other code...

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cookie_file = 'zhihu_cookies.pkl'
        try:
            with open(self.cookie_file, 'rb') as f:
                self.cookies = pickle.load(f)
        except (FileNotFoundError, EOFError, pickle.UnpicklingError):
            self.cookies = None

    def start_requests(self):
        if self.cookies:
            # Reuse the cached cookies
            for url in self.start_urls:
                yield scrapy.Request(
                    url,
                    cookies=self.cookies,
                    callback=self.parse,
                    errback=self.login,   # restart the login flow if the request fails
                    dont_filter=True
                )
        else:
            # No cached cookies: log in first
            yield scrapy.Request(
                url='https://www.zhihu.com/',
                callback=self.get_xsrf,
                dont_filter=True
            )

    def save_cookies(self, response):
        with open(self.cookie_file, 'wb') as f:
            pickle.dump(response.request.cookies, f)
```
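Note that `response.request.cookies` is frequently empty because Scrapy keeps cookies in its own cookiejar. A more reliable variant (a sketch with a hypothetical helper name) parses the Set-Cookie headers of the login response directly, for example when called from `after_login`:

```python
from http.cookies import SimpleCookie


def save_cookies_from_headers(self, response):
    # Collect every cookie set by the login response and pickle the result
    cookies = {}
    for header in response.headers.getlist('Set-Cookie'):
        jar = SimpleCookie()
        jar.load(header.decode('utf-8'))
        for name, morsel in jar.items():
            cookies[name] = morsel.value
    with open(self.cookie_file, 'wb') as f:
        pickle.dump(cookies, f)
```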
## 7. Handling Expired Logins

Cached cookies eventually expire, at which point Zhihu starts answering with 401/403. A custom retry middleware can catch this:

```python
from scrapy.downloadermiddlewares.retry import RetryMiddleware


class CustomRetryMiddleware(RetryMiddleware):
    def process_response(self, request, response, spider):
        if response.status in (401, 403):
            # The login appears to have expired; retry the request
            return self._retry(request, 'login expired', spider) or response
        return super().process_response(request, response, spider)
```
## 8. Scrapy Settings

Key settings in settings.py:

```python
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

DOWNLOAD_DELAY = 2
CONCURRENT_REQUESTS_PER_DOMAIN = 1

# Enable the custom middlewares
DOWNLOADER_MIDDLEWARES = {
    'zhihu_login.middlewares.ZhihuSeleniumMiddleware': 543,
    'zhihu_login.middlewares.CustomRetryMiddleware': 550,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}

# Cookie settings
COOKIES_ENABLED = True
COOKIES_DEBUG = True
```
## 9. Proxies and User-Agent Rotation

To reduce the risk of being blocked, requests can be routed through proxies:

```python
import base64


class ProxyMiddleware:
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://your_proxy:port'
        # Paid proxies may require authentication
        request.headers['Proxy-Authorization'] = 'Basic ' + base64.b64encode(b'user:pass').decode()
```
The User-Agent can be randomized in the same way:

```python
from fake_useragent import UserAgent


class RandomUserAgentMiddleware:
    def __init__(self):
        # Build the User-Agent pool once instead of on every request
        self.ua = UserAgent()

    def process_request(self, request, spider):
        request.headers['User-Agent'] = self.ua.random
```
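Assuming both classes live in zhihu_login/middlewares.py, register them alongside the earlier middlewares (the priority numbers are illustrative):

```python
DOWNLOADER_MIDDLEWARES = {
    'zhihu_login.middlewares.ProxyMiddleware': 350,
    'zhihu_login.middlewares.RandomUserAgentMiddleware': 400,
    'zhihu_login.middlewares.ZhihuSeleniumMiddleware': 543,
    'zhihu_login.middlewares.CustomRetryMiddleware': 550,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}
```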
## 10. Items and Storage

Define the fields to collect in items.py:

```python
import scrapy


class ZhihuItem(scrapy.Item):
    question_id = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    answer_count = scrapy.Field()
    follower_count = scrapy.Field()
    created_time = scrapy.Field()
    updated_time = scrapy.Field()
```
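As a rough illustration of how these fields might be filled, here is a minimal question-page callback for the spider. The CSS selectors are assumptions about Zhihu's markup and will almost certainly need to be adjusted against the live page:

```python
# Requires: from zhihu_login.items import ZhihuItem at the top of zhihu.py
def parse_question(self, response):
    item = ZhihuItem()
    # The question id is the last path segment of the URL
    item['question_id'] = response.url.rstrip('/').split('/')[-1]
    # Selector guesses; verify against the current page structure
    item['title'] = response.css('h1.QuestionHeader-title::text').get()
    item['content'] = ''.join(response.css('.QuestionRichText ::text').getall()).strip()
    yield item
```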
The results can be stored in MongoDB with a pipeline in pipelines.py:

```python
import pymongo


class MongoPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db['zhihu_questions'].insert_one(dict(item))
        return item
```
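To activate the pipeline, add the connection settings and register it in settings.py (the URI and database name below are placeholders):

```python
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'zhihu'

ITEM_PIPELINES = {
    'zhihu_login.pipelines.MongoPipeline': 300,
}
```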
## Conclusion

If the login fails, first check that the `_xsrf` token and the `timestamp` parameter are current. With the steps covered above you now have the complete flow for simulating a Zhihu login with Scrapy. In practice the implementation will need adjusting as Zhihu's anti-crawling measures evolve. Please crawl responsibly: respect Zhihu's robots.txt and keep the request rate low enough not to put undue pressure on Zhihu's servers.
Note: this article is intended for technical study only and must not be used for illegal purposes. Any use of Zhihu data should comply with the relevant laws and regulations and with the site's user agreement.