# Scrape Douban movie/TV review data.
# The fetched data is saved to an Excel workbook.
import requests
from openpyxl import Workbook
from bs4 import BeautifulSoup
from openpyxl.styles import Alignment, Border, Side
from concurrent.futures import ThreadPoolExecutor
def _node_text(node, default=""):
    """Return the stripped text of a BeautifulSoup node, or *default* if it is None."""
    if node is None:
        return default
    return node.text.strip()


def get_comments(url):
    """Fetch one comments page and parse the short comments it contains.

    Args:
        url: URL of the comments page to download.

    Returns:
        A list of dicts, one per ``comment-item`` element, with the keys
        ``commenter``, ``comment_time``, ``comment_content``, ``vote_count``
        and ``rating``. An empty list is returned when the request raises,
        times out, or answers with a status other than 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled connection from blocking the caller forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"请求 {url} 失败: {exc}")
        return []

    if response.status_code != 200:
        print(f"请求 {url} 失败")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    comments = []
    for item in soup.find_all(class_="comment-item"):
        info = item.find(class_="comment-info")
        # Missing sub-elements yield an empty string instead of raising, so
        # one malformed entry does not discard the rest of the page.
        commenter = _node_text(info.a if info is not None else None)

        # Not every comment carries a star rating; fall back to a placeholder.
        rating_element = item.find(class_="rating")
        if rating_element is not None:
            rating = rating_element.attrs.get('title', "暂无评分")
        else:
            rating = "暂无评分"

        comments.append({
            'commenter': commenter,
            'comment_time': _node_text(item.find(class_="comment-time")),
            'comment_content': _node_text(item.find(class_="short")),
            'vote_count': _node_text(item.find(class_="vote-count")),
            'rating': rating,
        })
    return comments
def fetch_additional_info(card, id, limit, start=0):
    """Fetch a page of comments for one title.

    Args:
        card: Path segment placed directly after the host in the comments URL.
        id: Identifier of the title, placed after ``card`` in the URL. The
            name shadows the builtin but is kept for caller compatibility.
        limit: Value of the ``limit`` query parameter.
        start: Value of the ``start`` query parameter. Defaults to 0, which
            matches the previously hard-coded value.

    Returns:
        Whatever ``get_comments`` returns for the constructed URL.
    """
    new_url = (
        f'https://movie.douban.com/{card}/{id}/comments'
        f'?start={start}&limit={limit}&status=P&sort=new_score'
    )
    return get_comments(new_url)
def _prompt_category():
    """Ask for movie or TV until '1' or '2' is entered; return (api_url, referer_url)."""
    while True:
        leibie = input("请选择需要获取数据的类别(1、电影 2、电视剧):")
        if leibie == '1':
            return ('https://m.douban.com/rexxar/api/v2/movie/recommend?',
                    'https://movie.douban.com/explore')
        if leibie == '2':
            return ('https://m.douban.com/rexxar/api/v2/tv/recommend?',
                    'https://tv.douban.com/tv')
        print("请输入1或者2")


def _prompt_positive_int(prompt):
    """Keep asking with *prompt* until the user enters an integer greater than zero."""
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            print("请输入一个有效的整数!")
            continue
        if value > 0:
            return value
        print("请输入一个大于零的整数!")


def _finalize_group(ws, start_row, end_row):
    """Merge the five per-title columns over a row range and style all ten columns."""
    thin = Side(style='thin')
    border = Border(left=thin, right=thin, top=thin, bottom=thin)
    centered = Alignment(horizontal='center', vertical='center')
    top_left = Alignment(horizontal='left', vertical='top')
    for col in range(1, 11):
        is_title_column = col <= 5
        # A single-row group needs no merge; merging only applies to 2+ rows.
        if is_title_column and end_row > start_row:
            ws.merge_cells(start_row=start_row, start_column=col,
                           end_row=end_row, end_column=col)
        alignment = centered if is_title_column else top_left
        for row in range(start_row, end_row + 1):
            cell = ws.cell(row=row, column=col)
            cell.alignment = alignment
            cell.border = border


def main():
    """Interactively export title details and their short comments to ``douban.xlsx``.

    The user is asked for a category, the number of titles, the number of
    comments per title, and a tag filter. The recommend API is queried once.
    Comments for each returned item that has both tags and a rating are then
    fetched concurrently. Rows are written on the calling thread only, so the
    worksheet and the merge bookkeeping are never touched from worker threads.
    Consecutive items with identical title details share one merged block in
    the first five columns.
    """
    wb = Workbook()
    ws = wb.active
    # Header row.
    ws.append(['剧名', '评价人数', '星级', '评分', '副标题', '评论者', '评论时间', '评论内容', '点赞数', '评论评分'])

    api_url, url = _prompt_category()
    count = _prompt_positive_int("请输入需要获取影评的数量:")
    limit = _prompt_positive_int("请输入需要获取的评论数: ")
    tags = input("请输入需要获取的标签(多个标签用英文逗号分隔,例如2024,华语):")

    payload = {
        'refresh': 0,
        'start': 0,
        'count': count,
        'selected_categories': '{}',
        'uncollect': 'false',
        'tags': tags,
        'sort': 'S'
    }
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': 'll="118302"; bid=ka_0RRoL2UQ; ap_v=0,6.0; __utma=30149280.145861931.1711419484.1711419484.1711419484.1; __utmb=30149280.0.10.1711419484; __utmc=30149280; __utmz=30149280.1711419484.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic',
        'Referer': url,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306'
    }
    try:
        response = requests.post(api_url, data=payload, headers=headers, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON.
        print(f"获取推荐列表失败: {exc}")
        return

    # Keep only items that have both tags and a rating value, and that carry
    # the two fields needed to build the comments URL.
    selected = []
    for item in data.get('items', []):
        rating_info = item.get('rating') or {}
        if not (item.get('tags') and rating_info.get('value')):
            continue
        if not item.get('card') or item.get('id') is None:
            continue
        selected.append(item)

    def fetch_item_comments(item):
        """Fetch comments for one item; a failure yields an empty list."""
        try:
            return fetch_additional_info(item['card'], item['id'], limit)
        except Exception as exc:  # thread boundary: report and keep going
            print(f"获取 {item.get('title', '暂无标题')} 的评论失败: {exc}")
            return []

    # Only the network fetches run in the pool; map() returns results in
    # submission order, so the sheet layout is deterministic.
    with ThreadPoolExecutor(max_workers=10) as executor:
        comment_lists = list(executor.map(fetch_item_comments, selected))

    group_key = None
    group_start = group_end = None
    for item, comments in zip(selected, comment_lists):
        if not comments:
            # Nothing to write for this title, so it contributes no rows.
            continue
        rating_info = item.get('rating') or {}
        key = (
            item.get('title', '暂无标题'),
            rating_info.get('count'),
            rating_info.get('star_count'),
            rating_info.get('value'),
            item.get('card_subtitle', '暂无副标题'),
        )
        if key != group_key:
            # Title details changed: close the previous block, start a new one.
            if group_key is not None:
                _finalize_group(ws, group_start, group_end)
            group_key = key
            group_start = ws.max_row + 1
        for comment in comments:
            ws.append([*key,
                       comment['commenter'], comment['comment_time'],
                       comment['comment_content'], comment['vote_count'],
                       comment['rating']])
        group_end = ws.max_row

    # Merge and style the last block, which the loop above never closes.
    if group_key is not None:
        _finalize_group(ws, group_start, group_end)

    wb.save("douban.xlsx")
    print("获取成功!已保存至当前目录下的 douban.xlsx 文件")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()