Python crawler: collecting WeChat official account article information from Sogou WeChat (https://weixin.sogou.com/) and saving it to a CSV file
2021/4/8 20:25:13
This article walks through a Python crawler that collects WeChat official account article information from Sogou WeChat (https://weixin.sogou.com/) and saves it to a CSV file. It should be a useful reference for anyone tackling a similar scraping task; follow along with the code below.
import requests
from requests.exceptions import RequestException
from lxml import etree
import csv
import re
import time


def get_page(url):
    """Fetch the page source for a URL, or return None on failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 '
                          '(KHTML, like Gecko) Version/5.1 Safari/534.50',
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def timeswitch(chuo):
    """Convert a Unix timestamp (in seconds) to a 'YYYY-MM-DD HH:MM:SS' string."""
    tupTime = time.localtime(chuo)
    standardTime = time.strftime("%Y-%m-%d %H:%M:%S", tupTime)
    return standardTime


def parse_page(text):
    """Parse a search-result page and return (title, link, summary, account, publish time) tuples."""
    html = etree.HTML(text)

    # Article titles
    biaotinew = []
    biaoti = html.xpath("//div[@class='txt-box']/h3/a")
    for bt in biaoti:
        biaotinew.append(bt.xpath("string(.)"))
    print(biaotinew)

    # Article links (the hrefs are relative, so prepend the site root)
    wangzhinew = []
    base_url = 'https://weixin.sogou.com'
    wangzhi = html.xpath("//div[@class='txt-box']/h3//@href")
    for wz in wangzhi:
        wangzhinew.append(base_url + wz)
    print(wangzhinew)

    # Article summaries
    zhaiyaonew = []
    zhaiyao = html.xpath("//p[@class='txt-info']")
    for zy in zhaiyao:
        zhaiyaonew.append(zy.xpath("string(.)"))
    print(zhaiyaonew)

    # Official account names
    gzh = html.xpath("//a[@class='account']/text()")
    print(gzh)

    # Publish times: pull the numeric timestamp out of each span and format it
    lastnew = []
    shijiannew = []
    shijian = html.xpath("//div[2]/div/span")
    for sj in shijian:
        shijiannew.append(sj.xpath("string(.)"))
    for bp in shijiannew:
        newstr = re.findall(r"\d+\.?\d*", bp)
        lastor = ''.join(newstr)
        lastnew.append(timeswitch(int(lastor)))
    print(lastnew)

    return zip(biaotinew, wangzhinew, zhaiyaonew, gzh, lastnew)


def change_page1(number):
    """Build the URL for result page `number` of the keyword search."""
    base_url = ('https://weixin.sogou.com/weixin?oq=&query=python&_sug_type_=1&sut=0'
                '&lkt=0%2C0%2C0&s_from=input&ri=1&_sug_=n&type=2&sst0=1604564741184&page=')
    url = base_url + str(number) + '&ie=utf8&p=40040108&dp=1&w=01015002&dr=1'
    return url


def save_to_csv(result, filename):
    """Append one row to the CSV file."""
    with open(filename, 'a', encoding='utf-8-sig', newline="") as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerow(result)


def main():
    # Write the header row first (newline="" avoids blank lines on Windows)
    f = open('message.csv', 'a+', encoding='utf-8-sig', newline="")
    csv_writer = csv.writer(f)
    csv_writer.writerow(["文章名称", "文章链接地址", "摘要", "公众号名称", "发布时间"])
    f.close()

    # Crawl result pages 1-5
    for number in range(1, 6):
        url = change_page1(number)
        text = get_page(url)
        if text is None:  # skip pages that failed to download
            continue
        result = parse_page(text)
        for a in result:
            save_to_csv(a, filename='message.csv')


if __name__ == '__main__':
    main()
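To sanity-check the output after a run, the CSV can be read straight back with the standard library. The snippet below is only a minimal sketch, assuming the script above has already written message.csv to the working directory:

import csv

# Minimal check: print the header row and the first few data rows of message.csv.
with open('message.csv', 'r', encoding='utf-8-sig', newline='') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        print(row)
        if i >= 5:  # only show the first handful of rows
            break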
That concludes this introduction to collecting Sogou WeChat (https://weixin.sogou.com/) official account article information with a Python crawler and saving it to a CSV file. I hope it proves helpful, and I hope you will keep supporting 为之网!