Python Web Scraper
Scraping Douban movie reviews as a worked example

The code below uses the standard-library modules csv, random, re and time, together with the third-party packages requests and bs4 (BeautifulSoup):

import csv
import random
import re
import time

import requests
from bs4 import BeautifulSoup

1. Prepare the request header
def get_header(movie_id):
    UserAgent_List = [
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
    ]
    header = {
        'User-agent': random.choice(UserAgent_List),
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com/subject/' + movie_id + '/?from=showing',
    }
    '''
    The Referer (HTTP referer) is a field in the HTTP request header that records, as a URL, the page
    from which the current page was linked. In other words, the current page can use the Referer to
    check where a visitor came from; this is also commonly used to guard against forged cross-site requests.
    '''
    time.sleep(random.randint(5, 15))  # add a random delay so requests are not sent too quickly and the crawler does not get blocked
    return header
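
As a quick check, calling the function returns a dict with a randomly chosen User-agent; note that it also sleeps for 5-15 seconds before returning. (The movie id 3878007 is the one used in the main block below.)

header = get_header('3878007')
print(header)
# {'User-agent': '<one of the strings above, chosen at random>',
#  'Host': 'movie.douban.com',
#  'Referer': 'https://movie.douban.com/subject/3878007/?from=showing'}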
2. Build the initial URL

def get_url(movie_id):
    base_url = 'https://movie.douban.com/subject/'
    url = base_url + movie_id + '/comments?status=P'
    return url
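
For the same movie id, this yields the URL of the first page of short comments:

print(get_url('3878007'))
# https://movie.douban.com/subject/3878007/comments?status=P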
3. Start scraping the comment pages

def start_spider(movie_id, url, header):
    print(url)
    cur_page = requests.get(url, headers=header, timeout=8)
    if cur_page.status_code == 200:  # the page was fetched successfully
        # specify the parser: lxml, html5lib and html.parser are currently supported
        soup = BeautifulSoup(cur_page.text, 'html5lib')
        # find all comment blocks; each contains the avatar and the comment text,
        # and the avatar links to the user's home page, which is used to look up user information
        comment_block = soup.find_all('div', class_='comment-item')
        for comment in comment_block:
            result_list = []
            nick_name = comment.find('div', class_='avatar').a['title']
            result_list.append(nick_name)
            user_url = comment.find('div', class_='avatar').a['href']
            result_list.append(user_url)
            # print(nick_name)
            # print(user_url)
            user_list = spider_for_user(user_url, header)  # fetch the basic information of the user who wrote this comment
            result_list.extend(user_list)  # append user_list to the end of result_list
            # print(user_list)
            # map Douban's Chinese rating labels (力荐/推荐/还行/较差/很差) to star counts
            star_level = comment.find('span', class_='rating')
            if star_level:
                star_level = star_level['title']
                if star_level == '力荐':
                    star = '五星'
                elif star_level == '推荐':
                    star = '四星'
                elif star_level == '还行':
                    star = '三星'
                elif star_level == '较差':
                    star = '二星'
                elif star_level == '很差':
                    star = '一星'
                else:
                    star = ''
            else:
                star = ''
            result_list.append(star)
            # print(star)
            # the trailing space in the class name is intentional: it matches the exact class attribute value on Douban's page
            comment_time = comment.find('span', class_='comment-time ').text.strip()
            result_list.append(comment_time)
            # print(comment_time)
            comment_vote = comment.find('span', class_='votes')
            if comment_vote:
                comment_vote = comment_vote.text
            else:
                comment_vote = ''
            result_list.append(comment_vote)
            # print(comment_vote)
            comment_content = comment.find('span', class_='short').text
            result_list.append(comment_content)
            # keep only the Chinese characters (dropping punctuation etc.) to simplify later word segmentation and frequency counting
            pattern = re.compile(r'[\u4e00-\u9fa5]+')
            filterdata = re.findall(pattern, comment_content)
            cleaned_comment = ''.join(filterdata)
            result_list.append(cleaned_comment)
            # print(cleaned_comment)
            # print(comment_content)
            print(result_list)
            # open the file for appending (it is created if it does not exist)
            with open("G:/movie.csv", mode="a+", encoding="utf-8-sig", newline='') as csvFile:
                # encoding="utf-8-sig": UTF-8 with BOM, so the Chinese text displays correctly under Python 3
                # get a writer object; the delimiter defaults to ","
                writer = csv.writer(csvFile)
                # writerow() writes result_list to the CSV file as one row
                writer.writerow(result_list)
        next_page = soup.find('a', class_='next')
        if next_page:
            # strip surrounding whitespace, drop the extra parameters after the first '&',
            # then keep only the start parameter after the '?'
            next_url = next_page['href'].strip().split("&")[0].split("?")[1]
            # e.g. next_url == 'start=20'
            if next_url:
                header = get_header(movie_id)
                # print(url + '&' + next_url)
                start_spider(movie_id, url.split("&")[0] + '&' + next_url, header)
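
The regular expression r'[\u4e00-\u9fa5]+' used above keeps only runs of Chinese characters and drops punctuation, digits and Latin letters. A stand-alone illustration (re is already imported at the top; the sample comment text is made up):

sample = '剧情不错,演员的表演也很到位!8/10'
pattern = re.compile(r'[\u4e00-\u9fa5]+')
print(re.findall(pattern, sample))           # ['剧情不错', '演员的表演也很到位']
print(''.join(re.findall(pattern, sample)))  # 剧情不错演员的表演也很到位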
4. Scrape the user's basic information

def spider_for_user(url, header):
    user_list = []
    user_page = requests.get(url, headers=header, timeout=8)
    if user_page.status_code == 200:  # the page was fetched successfully
        # specify the parser: lxml, html5lib and html.parser are currently supported
        soup = BeautifulSoup(user_page.text, 'html5lib')
        user_info = soup.find('div', class_='basic-info')  # find the block with the user's basic information
        if user_info:
            user_img = user_info.find('img', class_='userface')['src']
            if user_img:
                user_list.append(user_img)
            else:
                user_list.append("")
            # print(user_img)
            location = user_info.find('div', class_='user-info').a
            if location:
                location = location.text.strip()
                user_list.append(location)
            else:
                user_list.append("")
            # print(location)
            tmp_info = user_info.find('div', class_='pl')
            if tmp_info:
                # tmp_info.text looks like: 'lingrui1995  2012-08-07加入'
                # tmp_info.text.split(" ") gives: ['lingrui1995', '', '2012-08-07加入']
                user_name = tmp_info.text.split(" ")[0]
                create_time = tmp_info.text.split(" ")[2].split("加入")[0]
                if user_name:
                    user_list.append(user_name)
                else:
                    user_list.append("")
                if create_time:
                    user_list.append(create_time)
                else:
                    user_list.append("")
                # print(user_name)
                # print(create_time)
            else:
                user_list.append("")
                user_list.append("")
        else:
            user_list.append("")
            user_list.append("")
            user_list.append("")
            user_list.append("")
    return user_list
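
To see why the indices above pick out the user name and the registration date, here is the split applied to the sample value quoted in the comments (note the two spaces between the fields):

text = 'lingrui1995  2012-08-07加入'       # what tmp_info.text looks like
parts = text.split(" ")                    # ['lingrui1995', '', '2012-08-07加入']
user_name = parts[0]                       # 'lingrui1995'
create_time = parts[2].split("加入")[0]    # '2012-08-07'
print(user_name, create_time)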
5. Write to the CSV file

if __name__ == '__main__':
    '''
    movie_id = '3878007'
    url = get_url(movie_id)
    header = get_header(movie_id)
    # first write the header row of the CSV file, i.e. head_list
    head_list = ['昵称', '用户首页', '用户头像', '地点', '用户名称', '注册时间', '评分', '评论时间', '评论喜欢人数', '评论内容', '评论纯文本']
    with open("G:/movie.csv", mode="w", encoding='utf-8-sig', newline='') as csvFile:
        # encoding='utf-8-sig': for UTF-8, Excel expects a BOM (byte order mark) at the start of the file, otherwise it assumes ANSI encoding
        # newline='': prevents an extra blank line after every row that is written
        # mode
        # r : open read-only (default mode; the file must exist)
        # w : open write-only; an existing file is truncated and recreated, a missing file is created
        #     (the directory must exist; read*() methods cannot be used in this mode)
        # a : open in append mode; data is appended to an existing file, a missing file is created
        #     (read*() methods cannot be used in this mode)
        # the following four modifiers are combined with the modes above
        # 'b': binary mode
        # 't': text mode (default)
        # '+': read-write mode
        # 'U': universal newline mode (legacy)
        # common mode combinations
        # 'r' or 'rt': default, text read mode
        # 'w' or 'wt': text write mode (the file is truncated on open)
        # 'rb': binary read mode
        # 'ab': binary append mode
        # 'wb': binary write mode (the file is truncated on open)
        # 'r+': text read-write mode; you can write anywhere in the file, but the write pointer starts
        #       at the beginning, so existing content is overwritten
        # 'w+': text read-write mode (the file is truncated on open); read*() can be used
        # 'a+': text read-write mode (writes only go to the end of the file); read*() can be used
        # 'rb+': binary read-write mode
        # 'wb+': binary read-write mode (the file is truncated on open)
        # 'ab+': binary read-write mode (writes only go to the end of the file)
        # get a writer object
        writer = csv.writer(csvFile)
        # writerow() writes head_list to the CSV file as the header row
        writer.writerow(head_list)
    start_spider(movie_id, url, header)
    '''
with open("F:\PythonProjects\movie.csv", mode="r", encoding='utf-8-sig', newline='') as csvFile:
csv_reader = csv.reader(csvFile)
for row in csv_reader:
print(row)
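
The commented-out block above is the part that actually produces the CSV file. A minimal, self-contained sketch of that write step is shown below; the file name and the row data here are made up purely for illustration:

import csv

demo_head = ['昵称', '用户首页', '评分', '评论内容']  # a shortened header row for the sketch
demo_row = ['某用户', 'https://www.douban.com/people/xxx/', '五星', '好看']  # made-up data

# utf-8-sig writes a BOM so that Excel recognizes the file as UTF-8;
# newline='' prevents blank lines between rows on Windows
with open("movie_demo.csv", mode="w", encoding="utf-8-sig", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(demo_head)
    writer.writerow(demo_row)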