Python爬虫

以爬取豆瓣电影评价为例

1.准备header

def get_header(movie_id, delay_range=(5, 15)):
    """Build randomized request headers for a Douban movie page.

    A User-Agent is picked at random from a fixed pool and the Referer is
    set to the movie's subject page. The HTTP Referer field tells the server
    which page linked to the one being requested; sites use it to see where
    visitors come from and to counter forged cross-site requests.

    Before returning, the function sleeps for a random whole number of
    seconds so that consecutive requests are throttled and less likely to
    get the client blocked.

    Args:
        movie_id: Douban subject id as a string, e.g. ``'3878007'``.
        delay_range: Inclusive ``(min, max)`` bounds, in seconds, of the
            random pause. Defaults to the original 5-15 seconds.

    Returns:
        A dict with the ``User-agent``, ``Host`` and ``Referer`` headers.
    """
    user_agents = [
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
    ]
    header = {
        'User-agent': random.choice(user_agents),
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com/subject/' + movie_id + '/?from=showing',
    }
    # Random pause so requests are not fired too quickly (avoids being blocked).
    time.sleep(random.randint(*delay_range))
    return header

2.初始化url

def get_url(movie_id):
    """Return the first comments-page URL for the given Douban movie id.

    ``movie_id`` is the subject id as a string; the result points at the
    movie's ``comments`` listing with ``status=P``.
    """
    return 'https://movie.douban.com/subject/' + movie_id + '/comments?status=P'

3.开始爬取页面

# Maps the rating tag's title text to the star label written to the CSV.
_STAR_BY_TITLE = {
    '力荐': '五星',
    '推荐': '四星',
    '还行': '三星',
    '较差': '二星',
    '很差': '一星',
}


def _next_start_query(href):
    """Return the ``start=N`` query fragment of a "next page" href, or ``''``."""
    from urllib.parse import parse_qs, urlsplit

    # Undo HTML escaping in case the href still carries ``&amp;`` separators.
    query = urlsplit(href.strip().replace('&amp;', '&')).query
    start_values = parse_qs(query).get('start')
    return 'start=' + start_values[0] if start_values else ''


def _comment_row(comment, header, chinese_only):
    """Convert one ``comment-item`` tag into a CSV row (list of strings)."""
    # The avatar link carries both the nickname (title) and the profile URL.
    avatar_link = comment.find('div', class_='avatar').a
    user_url = avatar_link['href']
    row = [avatar_link['title'], user_url]
    # Append the commenter's basic profile fields.
    row.extend(spider_for_user(user_url, header))

    rating = comment.find('span', class_='rating')
    row.append(_STAR_BY_TITLE.get(rating['title'], '') if rating else '')

    # Match on the class token itself; a trailing space is not part of a
    # class name.
    row.append(comment.find('span', class_='comment-time').text.strip())

    votes = comment.find('span', class_='votes')
    row.append(votes.text if votes else '')

    content = comment.find('span', class_='short').text
    row.append(content)
    # Keep only CJK characters (drops punctuation) to ease later word
    # segmentation and frequency counting.
    row.append(''.join(chinese_only.findall(content)))
    return row


def start_spider(movie_id, url, header, csv_path="G:/movie.csv"):
    """Scrape every comments page of a movie and append the rows to a CSV.

    Starting at ``url``, each page is fetched and every ``comment-item``
    block becomes one CSV row: nickname, profile URL, the fields returned by
    ``spider_for_user``, star label, comment time, vote count, comment text
    and the comment text reduced to Chinese characters. Pages are followed
    through the ``next`` link until there is none or a request does not
    return HTTP 200.

    Args:
        movie_id: Douban subject id, used to build fresh headers per page.
        url: URL of the first comments page.
        header: Request headers for the first page.
        csv_path: File the rows are appended to (UTF-8 with BOM).
    """
    chinese_only = re.compile(r'[\u4e00-\u9fa5]+')
    # Everything before the first '&' is the page-independent part of the URL.
    base_url = url.split("&")[0]

    while url:
        print(url)
        # ``headers=`` is required: the second positional argument of
        # requests.get is ``params`` (the query string), not the headers.
        cur_page = requests.get(url, headers=header, timeout=8)
        if cur_page.status_code != 200:
            return

        # Parser choices: lxml, html5lib or html.parser. The text is already
        # decoded, so no source encoding is passed.
        soup = BeautifulSoup(cur_page.text, 'html5lib')

        # utf-8-sig writes a BOM so Excel detects the encoding; newline=''
        # keeps the csv module from emitting blank lines between rows.
        with open(csv_path, mode="a+", encoding="utf-8-sig", newline='') as csv_file:
            writer = csv.writer(csv_file)
            for comment in soup.find_all('div', class_='comment-item'):
                row = _comment_row(comment, header, chinese_only)
                print(row)
                writer.writerow(row)

        next_page = soup.find('a', class_='next')
        next_url = _next_start_query(next_page['href']) if next_page else ''
        if not next_url:
            return
        # Fresh headers (and the built-in random delay) for the next page.
        header = get_header(movie_id)
        url = base_url + '&' + next_url

4.爬取用户基本信息

def spider_for_user(url, header):
    """Fetch a user's profile page and extract four basic fields.

    Args:
        url: Profile page URL taken from a comment's avatar link.
        header: Request headers dict; a ``Host`` entry, if present, is not
            forwarded (see below).

    Returns:
        A list of exactly four strings, in this order: avatar image URL,
        location, user name, join date. Any field that cannot be found is
        ``""``; if the page does not return HTTP 200 all four are ``""``,
        so callers always get the same number of columns.
    """
    empty = ["", "", "", ""]

    # The profile URL comes from the scraped page and need not be on the
    # host named in the caller's headers, so let requests derive Host from
    # the URL instead of forcing a possibly mismatching one.
    request_headers = {k: v for k, v in header.items() if k.lower() != 'host'}
    # ``headers=`` is required: the second positional argument of
    # requests.get is ``params`` (the query string), not the headers.
    user_page = requests.get(url, headers=request_headers, timeout=8)
    if user_page.status_code != 200:
        return empty

    # Parser choices: lxml, html5lib or html.parser. The text is already
    # decoded, so no source encoding is passed.
    soup = BeautifulSoup(user_page.text, 'html5lib')
    user_info = soup.find('div', class_='basic-info')  # basic-info module
    if not user_info:
        return empty

    avatar = user_info.find('img', class_='userface')
    user_img = (avatar.get('src') or "") if avatar else ""

    info_box = user_info.find('div', class_='user-info')
    location_link = info_box.a if info_box else None
    location = location_link.text.strip() if location_link else ""

    user_name = ""
    create_time = ""
    tmp_info = user_info.find('div', class_='pl')
    if tmp_info:
        # Example text: "lingrui1995  2012-08-07加入" -> name, then the join
        # date followed by the "加入" suffix. Splitting on any whitespace
        # tolerates a varying number of separators.
        parts = tmp_info.text.split()
        if parts:
            user_name = parts[0]
        if len(parts) > 1:
            create_time = parts[1].split("加入")[0]

    return [user_img, location, user_name, create_time]

5.写入csv文件

if __name__ == '__main__':

    # The triple-quoted string below is the scraping step, currently
    # disabled: it writes the CSV header row and then starts the spider.
    # It is a bare string expression, so none of it runs.
    '''
    movie_id = '3878007'
    url = get_url(movie_id)
    header = get_header(movie_id)
    cookie = get_cookie()
	# 先写入 csv 文件的第一行,也就是 head_list 信息
    head_list = ['昵称', '用户首页', '用户头像', '地点', '用户名称', '注册时间', '评分', '评论时间', '评论喜欢人数', '评论内容', '评论纯文本']
    with open("G:/movie.csv", mode="w", encoding='utf-8-sig', newline='') as csvFile:
        # encoding = 'utf-8-sig' 是因为对于UTF-8编码,Excel要求BOM(字节顺序标记)写在文件的开始,否则它会假设这是ANSI编码
        # newline = '' 保证文件在写入一行之后下一行不会有多余的空行
        # mode
        # r : 以只读模式打开(缺省模式)(必须保证文件存在)
        # w : 以只写模式打开。若文件存在,则会自动清空文件,然后重新创建;若文件不存在,则新建文件。使用这个模式必须要保证文件所在目录存在,文件可以不存在。该模式下不能使用read*()方法。
        # a : 以追加模式打开。若文件存在,则会追加到文件的末尾;若文件不存在,则新建文件。该模式不能使用read*()方法。
        # 下面四个模式要和上面的模式组合使用
        # 'b':以二进制模式打开
        # 't': 以文本模式打开(缺省模式)
        # '+':以读写模式打开
        # 'U':以通用换行符模式打开
        # 常见的mode组合
        # 'r' 或 'rt': 默认模式,文本读模式
        # 'w' 或 'wt': 以文本写模式打开(打开前文件会被清空)
        # 'rb':以二进制读模式打开
        # 'ab':以二进制追加模式打开
        # 'wb':以二进制写模式打开(打开前文件会被清空)
        # 'r+':以文本读写模式打开,可以写到文件任何位置;默认写的指针开始指在文件开头, 因此会覆写文件
        # 'w+':以文本读写模式打开(打开前文件会被清空)。可以使用read * ()
        # 'a+':以文本读写模式打开(写只能写在文件末尾)。可以使用read * ()
        # 'rb+':以二进制读写模式打开
        # 'wb+':以二进制读写模式打开(打开前文件会被清空)
        # 'ab+':以二进制读写模式打开

        # 获得 writer对象
        writer = csv.writer(csvFile)
        # 调用 writer的 writerow方法将 test_writer_data写入 test_writer.csv文件
        writer.writerow(head_list)
    start_spider(movie_id, url, header)
    '''

    # Read a previously written CSV back and print every row.
    # Raw string: in a normal literal "\P" and "\m" are invalid escape
    # sequences; the path value itself is unchanged.
    # utf-8-sig strips the BOM written for Excel; newline='' lets the csv
    # module handle line endings itself.
    with open(r"F:\PythonProjects\movie.csv", mode="r", encoding='utf-8-sig', newline='') as csv_file:
        for row in csv.reader(csv_file):
            print(row)