To implement a web scraper in Python that uses proxy IPs, you can use the requests library to send HTTP requests and route them through a proxy IP service. Below are two simple examples showing how to build a basic scraper, first with a list of free proxy IPs and then with a paid proxy IP service.
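At its core, using a proxy with requests just means passing a proxies mapping to the request call. A minimal sketch, assuming a proxy listening at 127.0.0.1:8080 (a placeholder address):

import requests

# Map each URL scheme to the proxy that should handle it
proxy = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080',
}

response = requests.get('http://example.com', proxies=proxy, timeout=5)
print(response.status_code)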
Install the dependencies:
pip install requests beautifulsoup4
Write the scraper code:
import random

import requests
from bs4 import BeautifulSoup

# List of (free) proxy IPs; replace these placeholders with working proxies
proxies = [
    {'http': 'http://127.0.0.1:8080'},
    {'http': 'http://127.0.0.1:8081'},
    # Add more proxy IPs here
]

def get_proxy():
    # Pick a proxy at random so requests are spread across the pool
    return random.choice(proxies)

def fetch_url(url):
    proxy = get_proxy()
    try:
        response = requests.get(url, proxies=proxy, timeout=5)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Extract data according to the page structure
    title = soup.find('title').text
    print(f"Title: {title}")

if __name__ == "__main__":
    url = "http://example.com"
    html = fetch_url(url)
    if html:
        parse_html(html)
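Free proxies are often unreliable, so you may want to retry a failed request with a different proxy. A minimal sketch that reuses fetch_url from the example above (the limit of 3 attempts is an arbitrary choice):

def fetch_url_with_retries(url, max_attempts=3):
    # Each call to fetch_url picks a new random proxy, so retrying
    # effectively rotates through the pool until one attempt succeeds
    for _ in range(max_attempts):
        html = fetch_url(url)
        if html is not None:
            return html
    return None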
To use a paid proxy IP service instead, register with the provider and obtain an API key.
Install the dependencies:
pip install requests beautifulsoup4
Write the scraper code:
import requests
from bs4 import BeautifulSoup

# API key for your proxy IP service provider
API_KEY = "your_api_key"
# Example endpoint; replace it with the proxy API URL your provider documents
PROXY_URL = "https://api.proxymesh.com/v1/getProxy"

def get_proxy():
    # Ask the provider's API for a proxy; the parameter names and the
    # response format depend on the specific provider
    params = {
        'apiKey': API_KEY,
        'protocol': 'http',
        'timeout': 5
    }
    response = requests.get(PROXY_URL, params=params)
    response.raise_for_status()
    proxy = response.json().get('proxy')
    return proxy

def fetch_url(url):
    proxy = get_proxy()
    try:
        response = requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=5)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # Extract data according to the page structure
    title = soup.find('title').text
    print(f"Title: {title}")

if __name__ == "__main__":
    url = "http://example.com"
    html = fetch_url(url)
    if html:
        parse_html(html)
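If you plan to send many requests through the same proxy, you can also set the proxy once on a requests.Session instead of passing it to every call. A minimal sketch, assuming the get_proxy() function from the example above:

import requests

def make_session():
    # Configure the proxy once; all requests made through this session reuse it
    proxy = get_proxy()
    session = requests.Session()
    session.proxies.update({'http': proxy, 'https': proxy})
    return session

session = make_session()
response = session.get("http://example.com", timeout=5)
print(response.status_code)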
I hope these examples help you implement a Python scraper that uses proxy IPs. If you have any specific questions or need further help, feel free to ask!