您好,登录后才能下订单哦!
# 如何用Python爬取酷我音乐
## 前言
在当今数字音乐时代,音乐平台如酷我音乐拥有海量的正版音乐资源。作为Python开发者,我们可能希望获取这些音乐数据用于个人学习、数据分析或开发第三方应用。本文将详细介绍如何使用Python爬虫技术爬取酷我音乐的数据,包括歌曲信息、歌词以及音频文件等。
**请注意**:本文仅用于技术交流和学习,请遵守相关法律法规和酷我音乐的用户协议,不得将爬取的数据用于商业用途或侵犯版权。
## 一、环境准备
在开始之前,我们需要准备以下Python环境和库:
```python
# 基础库
import requests # 用于发送HTTP请求
from bs4 import BeautifulSoup # 用于解析HTML
import json # 处理JSON数据
import re # 正则表达式
import os # 文件操作
import time # 时间控制
import random # 随机数生成
# 可选高级库
import selenium # 用于处理动态加载内容
from fake_useragent import UserAgent # 生成随机User-Agent
```

安装这些库可以使用pip命令:
pip install requests beautifulsoup4 selenium fake-useragent
首先我们需要了解酷我音乐的网页结构:
通过分析我们发现:
- 搜索API:https://www.kuwo.cn/api/www/search/searchMusicBykeyWord?key=关键词
- 需要处理反爬机制(Cookie、Referer、csrf等)
移动端API通常限制较少,我们可以尝试:
- 使用抓包工具(如Charles或Fiddler)分析手机APP请求
- 发现核心API:http://m.kuwo.cn/newh5/singles/songinfoandlrc?musicId=歌曲ID
def search_song(keyword):
    """Search Kuwo for songs matching *keyword*.

    Returns the list of result dicts on success, or None on any failure.
    Requires a valid Cookie and CSRF token copied from a logged-in browser
    session, because the web search API rejects anonymous requests.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Cookie': '你的酷我Cookie',
        'Referer': 'https://www.kuwo.cn/',
        'csrf': '你的CSRF Token'
    }
    url = 'https://www.kuwo.cn/api/www/search/searchMusicBykeyWord'
    try:
        # Pass the keyword via `params` so requests URL-encodes it; the
        # original f-string interpolation broke on spaces / non-ASCII input.
        # Timeout added so a hung connection cannot block the caller forever.
        response = requests.get(url, headers=headers,
                                params={'key': keyword}, timeout=10)
    except requests.RequestException as e:
        print(f"搜索失败:{e}")
        return None
    if response.status_code == 200:
        data = response.json()
        return data['data']['list']
    print(f"搜索失败,状态码:{response.status_code}")
    return None
def get_song_detail(music_id):
    """Fetch detailed metadata for one song by its Kuwo music id.

    Returns the parsed JSON dict on success, or None on any failure.
    """
    url = f'http://www.kuwo.cn/api/www/music/musicInfo?mid={music_id}'
    headers = {
        'User-Agent': UserAgent().random,  # rotate UA to look less bot-like
        'Referer': f'https://www.kuwo.cn/play_detail/{music_id}'
    }
    try:
        # Timeout added so a stalled connection cannot hang the caller.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
        print(f"获取歌曲详情失败,状态码:{response.status_code}")
        return None
    except requests.RequestException as e:
        # Narrowed from broad `Exception`: only network/HTTP errors are
        # expected here; anything else should surface as a real bug.
        print(f"发生错误:{str(e)}")
        return None
酷我音乐有较强的反爬措施,我们需要处理:
def get_random_headers():
    """Build a request-header dict with a freshly randomized User-Agent."""
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Referer': 'https://www.kuwo.cn/',
    }
    # A new random UA per call makes repeated requests look less uniform.
    headers['User-Agent'] = UserAgent().random
    return headers
# Rotating proxy pool consumed by get_with_proxy(). The entries below are
# placeholders — NOTE(review): '123.456.789.012' is not even a valid IPv4
# address; replace these with working proxies before use.
PROXY_POOL = [
    'http://123.456.789.012:8888',
    'http://112.113.114.115:9999',
    # more proxy IPs...
]
def get_with_proxy(url):
    """GET *url* through a randomly chosen proxy from PROXY_POOL.

    Returns the Response on success, or None if the request fails.
    """
    chosen = random.choice(PROXY_POOL)
    # Map both schemes: the original only set 'http', so HTTPS URLs
    # silently bypassed the proxy entirely.
    proxies = {'http': chosen, 'https': chosen}
    try:
        return requests.get(url, proxies=proxies, timeout=10)
    except requests.RequestException:
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit.
        return None
def safe_request(url, max_retry=3):
    """GET *url* with random headers, jittered delay and up to *max_retry* tries.

    Returns the Response on HTTP 200, or None when every attempt fails.
    """
    for attempt in range(1, max_retry + 1):
        try:
            time.sleep(random.uniform(0.5, 2.0))  # random delay to dodge rate-limiting
            # Timeout added so one dead host cannot stall the retry loop.
            response = requests.get(url, headers=get_random_headers(), timeout=10)
            if response.status_code == 200:
                return response
            # Non-200 responses used to be retried silently; make them visible.
            print(f"请求失败,重试 {attempt}/{max_retry}: 状态码 {response.status_code}")
        except requests.RequestException as e:
            print(f"请求失败,重试 {attempt}/{max_retry}: {str(e)}")
    return None
def get_audio_url(music_id):
    """Resolve the playable MP3 URL for *music_id* via Kuwo's url-convert API.

    Returns the URL string, or None when the request fails or the payload
    carries no 'url' field.
    """
    api = f'http://www.kuwo.cn/url?format=mp3&rid={music_id}&type=convert_url3'
    response = safe_request(api)
    if response is None:
        return None
    return response.json().get('url')
def download_music(music_id, save_path='./musics'):
    """Download a song's MP3 to ``<save_path>/<music_id>.mp3``.

    Returns True on success, False when the audio URL cannot be resolved
    or the transfer fails.
    """
    # exist_ok avoids the exists()+makedirs() race of the original.
    os.makedirs(save_path, exist_ok=True)
    audio_url = get_audio_url(music_id)
    if not audio_url:
        print("无法获取音频URL")
        return False
    try:
        # Stream in chunks so large files never sit in memory; the `with`
        # guarantees the connection is released even on error (the original
        # leaked it), and raise_for_status() stops us saving an HTML error
        # page as an .mp3.
        with requests.get(audio_url, stream=True, timeout=30) as response:
            response.raise_for_status()
            file_path = os.path.join(save_path, f'{music_id}.mp3')
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        print(f"下载成功:{file_path}")
        return True
    except Exception as e:
        # Broad on purpose, matching the original best-effort contract:
        # any failure is reported and mapped to False.
        print(f"下载失败:{str(e)}")
        return False
def get_lyrics(music_id):
    """Fetch the timestamped lyric list for *music_id* from the mobile API.

    Returns a list of lyric-line dicts on success, or None on failure.
    """
    api = f'http://m.kuwo.cn/newh5/singles/songinfoandlrc?musicId={music_id}'
    response = safe_request(api)
    if response is None:
        return None
    payload = response.json()
    # The mobile endpoint signals success via a 'status' field in the body.
    if payload.get('status') != 200:
        return None
    return payload.get('data', {}).get('lrclist', [])
def save_lyrics(music_id, lyrics, save_path='./lyrics'):
    """Write *lyrics* to ``<save_path>/<music_id>.lrc`` in LRC format.

    Each entry in *lyrics* must carry 'time' and 'lineLyric' keys.
    """
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    target = os.path.join(save_path, f'{music_id}.lrc')
    # Format every line first, then write in one batched call.
    formatted = [f"[{entry['time']}]{entry['lineLyric']}\n" for entry in lyrics]
    with open(target, 'w', encoding='utf-8') as fh:
        fh.writelines(formatted)
    print(f"歌词保存成功:{target}")
class KuWoMusicSpider:
    """Session-based Kuwo crawler: search, metadata lookup and download."""

    # Characters illegal in file names on Windows (plus '/'); the original
    # only replaced '/', so names containing ':' or '?' crashed the save.
    _BAD_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self):
        # One Session reuses connections and keeps headers/cookies across calls.
        self.session = requests.Session()
        self.session.headers.update(get_random_headers())

    @classmethod
    def _safe_name(cls, name):
        """Replace path-hostile characters so *name* is a valid path component."""
        return cls._BAD_CHARS.sub('_', name)

    def search(self, keyword, page=1, size=30):
        """Search songs; return the raw JSON result, or None on HTTP error."""
        params = {
            'key': keyword,
            'pn': page,
            'rn': size
        }
        url = 'https://www.kuwo.cn/api/www/search/searchMusicBykeyWord'
        response = self.session.get(url, params=params, timeout=10)
        return response.json() if response.ok else None

    def get_music_info(self, music_id):
        """Fetch detailed song info; return the JSON dict, or None on HTTP error."""
        url = f'http://www.kuwo.cn/api/www/music/musicInfo?mid={music_id}'
        response = self.session.get(url, timeout=10)
        return response.json() if response.ok else None

    def download(self, music_id, save_dir='downloads'):
        """Download audio (and lyrics if available) for *music_id*.

        Files land in ``<save_dir>/<artist>/<album>/<name>_<id>.mp3`` (+ .lrc).
        Returns True on success, False when metadata or the audio URL is
        unavailable.
        """
        info = self.get_music_info(music_id)
        if not info:
            return False
        data = info.get('data', {})
        # .get with defaults: a missing metadata field no longer raises KeyError.
        artist = self._safe_name(data.get('artist', 'unknown'))
        album = self._safe_name(data.get('album', 'unknown'))
        name = self._safe_name(data.get('name', str(music_id)))
        save_path = os.path.join(save_dir, artist, album)
        os.makedirs(save_path, exist_ok=True)

        audio_url = get_audio_url(music_id)
        if not audio_url:
            return False
        file_path = os.path.join(save_path, f"{name}_{music_id}.mp3")
        # `with` releases the streamed connection (the original leaked it).
        with requests.get(audio_url, stream=True, timeout=30) as response:
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)

        lyrics = get_lyrics(music_id)
        if lyrics:
            lrc_file = os.path.join(save_path, f"{name}_{music_id}.lrc")
            with open(lrc_file, 'w', encoding='utf-8') as f:
                for line in lyrics:
                    f.write(f"[{line['time']}]{line['lineLyric']}\n")
        return True
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def get_dynamic_content(url):
    """Render *url* in headless Chrome and return the final page HTML.

    Useful for pages whose content is injected by JavaScript after load.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Crude fixed wait for JS to populate the DOM;
        # NOTE(review): WebDriverWait with an expected condition would be
        # more robust than a flat sleep.
        time.sleep(3)
        return driver.page_source
    finally:
        # quit() used to be skipped whenever get() raised,
        # leaking one Chrome process per failure.
        driver.quit()
import aiohttp
import asyncio
async def async_fetch(session, url):
    """Fetch *url* with the shared aiohttp session and return the body text."""
    async with session.get(url) as resp:
        body = await resp.text()
    return body
async def async_main(urls):
    """Fetch every URL in *urls* concurrently; results keep input order."""
    async with aiohttp.ClientSession() as session:
        pending = [async_fetch(session, one_url) for one_url in urls]
        results = await asyncio.gather(*pending)
    return results
# MongoDB存储示例
from pymongo import MongoClient
class MusicDB:
    """Thin MongoDB wrapper that upserts Kuwo song documents keyed by `rid`."""

    def __init__(self):
        client = MongoClient('mongodb://localhost:27017/')
        self.client = client
        self.db = client['music_db']
        self.collection = self.db['kuwo_music']

    def save_song(self, song_data):
        """Insert or update one song document; returns the UpdateResult."""
        selector = {'rid': song_data['rid']}
        changes = {'$set': song_data}
        return self.collection.update_one(selector, changes, upsert=True)
本文详细介绍了如何使用Python爬取酷我音乐的数据,包括:
完整项目代码已上传至GitHub(示例地址)。希望本文能帮助你学习Python爬虫开发,但请务必遵守法律法规,合理使用爬虫技术。
声明:本文所有代码示例仅供学习参考,实际使用时请遵守酷我音乐的相关规定。过度爬取可能导致IP被封禁或承担法律责任。
这篇文章大约4100字,涵盖了从基础到进阶的酷我音乐爬虫实现方法,包含了代码示例、技术分析和法律注意事项。如需调整内容或补充细节,可以进一步修改完善。
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。