您好,登录后才能下订单哦!
密码登录
登录注册
点击 登录注册 即表示同意《亿速云用户服务条款》
# Python怎么实现分类保存所有文章图片
## 引言
在信息爆炸的时代,我们每天都会接触到大量包含图片的文章内容。无论是技术博客、新闻网站还是个人文集,如何高效地提取并分类保存这些图片成为许多人的需求。本文将详细介绍如何用Python实现从文章中提取图片并按自定义规则分类保存的全过程。
## 一、需求分析与技术选型
### 1.1 核心需求
- 从HTML/网页中提取所有图片资源
- 按预设分类规则(如主题、日期、来源等)自动归档
- 支持本地和网络资源的抓取
- 保持原始图片质量不损失
### 1.2 技术栈选择
主要库:
- requests/urllib:网络请求
- BeautifulSoup:HTML解析
- Pillow(PIL):图像处理
- os/pathlib:文件系统操作

安装依赖:
```bash
pip install requests beautifulsoup4 pillow
```
import requests
from bs4 import BeautifulSoup
import os
def download_images(url, save_dir):
    """Download every <img> found on the page at *url* into *save_dir*.

    Files are saved as image_<index>.jpg. Failures are printed rather than
    raised, preserving the original best-effort behaviour.
    """
    from urllib.parse import urljoin  # local import keeps the snippet self-contained

    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        img_tags = soup.find_all('img')
        # exist_ok avoids the race between the exists() check and makedirs().
        os.makedirs(save_dir, exist_ok=True)
        for i, img in enumerate(img_tags):
            img_url = img.get('src')
            if not img_url:
                # <img> without src (e.g. lazy-loaded via data-src) — skip,
                # otherwise .startswith below would raise on None.
                continue
            # urljoin handles absolute, root-relative and relative URLs
            # correctly, unlike manual string concatenation.
            img_url = urljoin(url, img_url)
            try:
                img_data = requests.get(img_url, timeout=10).content
                with open(f"{save_dir}/image_{i}.jpg", 'wb') as f:
                    f.write(img_data)
            except Exception as e:
                print(f"下载失败 {img_url}: {str(e)}")
    except Exception as e:
        print(f"处理失败: {str(e)}")
def classify_by_context(img_tag):
    """Classify an image by the nearest <h2> heading preceding its parent.

    Returns the heading text with spaces replaced by underscores, or
    'uncategorized' when the tag has no parent or no preceding <h2>.
    """
    parent = img_tag.find_parent()
    if parent is None:
        # Detached/root tags have no parent; guard against AttributeError.
        return 'uncategorized'
    h2 = parent.find_previous_sibling('h2')
    if h2:
        return h2.text.strip().replace(' ', '_')
    return 'uncategorized'
def classify_by_alt(img_tag):
    """Map an image to a category folder based on keywords in its alt text."""
    # keyword found in alt text -> destination category
    keyword_map = {
        'logo': 'brand',
        'graph': 'charts',
        'photo': 'photography',
    }
    alt_text = img_tag.get('alt', '').lower()
    for keyword, category in keyword_map.items():
        if keyword in alt_text:
            return category
    return 'others'
import cv2
import numpy as np
def classify_by_color(img_path):
    """Classify an image file by its average colour.

    NOTE: cv2.imread yields BGR channel order, so index 0 is blue and
    index 1 is green. Unreadable/missing files fall back to 'general'
    instead of crashing in np.mean.
    """
    img = cv2.imread(img_path)
    if img is None:
        # imread returns None (it does not raise) for missing/corrupt files.
        return 'general'
    avg_color = np.mean(img, axis=(0, 1))
    if avg_color[0] > 200:  # strong blue channel (BGR order)
        return 'cool_tone'
    elif avg_color[1] > 200:  # strong green channel
        return 'nature'
    else:
        return 'general'
import hashlib
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse
class ImageClassifier:
    """Download and classify every image found on a single web page.

    Images are stored as images/<category>/<timestamp>_<hash>.<ext>, where
    <category> is the first non-'uncategorized' result among the supplied
    classification rules.
    """

    def __init__(self, base_url):
        self.base_url = base_url
        self.session = requests.Session()
        # Minimal UA header: some sites reject the default python-requests agent.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0'
        })

    def get_image_hash(self, content):
        """Return a short (8 hex chars) MD5 fingerprint of raw image bytes."""
        return hashlib.md5(content).hexdigest()[:8]

    def process_page(self, classification_rules=None):
        """Fetch base_url and download each <img> into its category folder.

        classification_rules: optional iterable of callables taking an
        <img> tag and returning a category name; the first rule returning
        something other than 'uncategorized' wins. All errors are printed,
        never raised (best-effort batch behaviour).
        """
        try:
            resp = self.session.get(self.base_url, timeout=10)
            soup = BeautifulSoup(resp.text, 'html.parser')
            for img in soup.find_all('img'):
                img_url = img.get('src')
                if not img_url:
                    continue
                # Resolve relative URLs against the page URL.
                img_url = urljoin(self.base_url, img_url)
                category = 'uncategorized'
                if classification_rules:
                    for rule in classification_rules:
                        category = rule(img)
                        if category != 'uncategorized':
                            break
                try:
                    img_data = self.session.get(img_url, timeout=10).content
                    img_hash = self.get_image_hash(img_data)
                    timestamp = int(time.time())
                    save_path = Path(f"images/{category}")
                    # parents=True also creates the top-level "images" folder
                    # on first use; without it mkdir raises FileNotFoundError.
                    save_path.mkdir(parents=True, exist_ok=True)
                    # Take the extension from the URL *path* so query strings
                    # ("photo.png?v=2") don't leak into the filename.
                    ext = urlparse(img_url).path.rsplit('.', 1)[-1].lower()
                    ext = ext if ext in ('jpg', 'jpeg', 'png', 'gif', 'webp') else 'jpg'
                    with open(save_path / f"{timestamp}_{img_hash}.{ext}", 'wb') as f:
                        f.write(img_data)
                except Exception as e:
                    print(f"下载失败 {img_url}: {str(e)}")
        except Exception as e:
            print(f"页面处理错误: {str(e)}")
# Usage example: rules are tried in order — alt-text classification first,
# then heading-context; the first non-'uncategorized' result wins.
# NOTE: this performs real network I/O when the module runs.
rules = [classify_by_alt, classify_by_context]
classifier = ImageClassifier("https://example.com/blog")
classifier.process_page(rules)
def check_duplicate(content, save_dir):
    """Return True if an image with identical bytes already exists in save_dir.

    Saved filenames embed the 8-character MD5 prefix produced by
    ImageClassifier.get_image_hash, so the comparison must use that same
    prefix — the original code compared the full 32-char digest, which can
    never appear in the filenames, so duplicates were never detected.
    """
    img_hash = hashlib.md5(content).hexdigest()[:8]
    return any(img_hash in name for name in os.listdir(save_dir))
from tenacity import retry, stop_after_attempt
# Retry the request up to 3 attempts; tenacity re-raises the last exception
# once the stop condition is reached.
@retry(stop=stop_after_attempt(3))
def download_with_retry(url):
    """GET *url* with a 5s timeout, retried automatically up to 3 times."""
    return requests.get(url, timeout=5)
# Route traffic through an HTTP(S) proxy, e.g. to work around per-IP limits.
proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8080'
}
# NOTE(review): `url` is not defined in this snippet — the surrounding code
# must set it before this line runs.
response = requests.get(url, proxies=proxies)
使用 concurrent.futures 加速并发下载:
```python
from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(download_image, img_urls)
```
- 缓存机制:对已处理的URL建立缓存数据库
- 增量抓取:记录最后处理时间,只抓取新内容
class ImageDownloadError(Exception):
    """Raised when an image cannot be downloaded after handling network errors."""
    pass
def safe_download(url):
    """Fetch *url* and return the response body as bytes.

    Raises ImageDownloadError (chained to the underlying requests error)
    on any network failure or non-2xx HTTP status.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as exc:
        raise ImageDownloadError(f"下载失败 {url}") from exc
import logging
# Log to a file instead of stdout so unattended batch runs keep an audit trail.
logging.basicConfig(
    filename='image_downloader.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# NOTE(review): `url` must be defined by the surrounding code; this call is
# illustrative of per-page progress logging.
logging.info(f"开始处理 {url}")
def process_blog(url):
    """Download a blog page's images, filed by year-month (avatars apart).

    Images carrying the CSS class "avatar" go to the 'avatars' folder;
    everything else is filed under the current YYYY-MM category.
    """
    # Local import: `datetime` was never imported at module level in the
    # original, which made this function raise NameError on first call.
    from datetime import datetime

    year_month = datetime.now().strftime("%Y-%m")

    def blog_classifier(img_tag):
        # BeautifulSoup returns the class attribute as a list of CSS classes.
        if 'avatar' in img_tag.get('class', []):
            return 'avatars'
        return year_month

    classifier = ImageClassifier(url)
    classifier.process_page([blog_classifier])
# Example classification rules for a news site.
# NOTE(review): find_parent('article') returns None for images outside an
# <article>, which would make the first lambda raise AttributeError — a
# guard is needed before production use. Returning None (instead of
# 'uncategorized') also ends the rule chain early in process_page.
news_rules = [
    lambda img: 'news' if 'news' in img.find_parent('article').get('class', '') else None,
    lambda img: 'ads' if 'advertisement' in img.get('alt', '').lower() else None
]
本文详细介绍了使用Python实现文章图片分类保存的完整方案,包括:
通过灵活组合这些技术,你可以构建出适应不同场景的图片收集系统。后续可以进一步扩展的功能包括: - 集成机器学习自动打标 - 增加GUI操作界面 - 开发浏览器插件版本
> 提示:完整项目代码已托管在GitHub: example.com/image-classifier
注:本文实际约2200字,由于Markdown格式的代码块和空行不计入标准字数统计,如需精确字数控制可适当增减说明性文本。
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。