Python Web Scraping: Introduction and Practical Applications
2021/6/27 1:14:39
This article introduces the basics of web scraping in Python and works up to a few end-to-end applications. It should be a useful reference for programmers tackling similar problems, so follow along and work through the examples.
1. Exercise 1: Fetching data from the Baidu homepage
import requests

# Send an HTTP request
res = requests.get("https://www.baidu.com")
# Check the response status
print(res.status_code)
# Output: 200
# 200 is the status code indicating the request succeeded.
# We can use the value of res.status_code to decide whether a request was successful.
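Beyond printing res.status_code by hand, requests can also raise an exception when a request fails. The sketch below is not part of the original exercise, but it shows the same check in a form that is easier to reuse: raise_for_status() raises requests.HTTPError on any 4xx/5xx response.

import requests

res = requests.get("https://www.baidu.com")
try:
    res.raise_for_status()  # raises requests.HTTPError for 4xx/5xx status codes
    print('Request succeeded: %s' % res.status_code)
except requests.HTTPError as err:
    print('Request failed: %s' % err)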
2. Using a crawler to download the story "Kong Yiji" [https://apiv3.shanbay.com/codetime/articles/mnvdu]
import requests

# Send an HTTP request
res = requests.get('https://apiv3.shanbay.com/codetime/articles/mnvdu')
# Check the response status
print('The page returned status code %s' % res.status_code)
with open('鲁迅文章.txt', 'w', encoding='utf-8') as file:
    # Write the text of the response into the file
    print('Downloading the story')
    file.write(res.text)
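If the saved text comes out garbled, the response encoding may be the culprit: requests guesses the encoding from the HTTP headers, which is not always right. One possible fix, shown here as a sketch rather than as part of the original exercise, is to let requests re-guess the encoding from the response body itself before writing the file:

import requests

res = requests.get('https://apiv3.shanbay.com/codetime/articles/mnvdu')
# apparent_encoding is requests' guess based on the body content itself
res.encoding = res.apparent_encoding
with open('鲁迅文章.txt', 'w', encoding='utf-8') as file:
    file.write(res.text)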
3. Downloading the Datawhale logo
[res.content is used to fetch and download binary content such as images, video, and audio.]
import requests

# Send an HTTP request to download the image
res = requests.get('https://www.www.zyiz.net/i/ll/?i=20210424184053989.PNG')
# Open a file named datawhale.png in binary write mode
with open('datawhale.png', 'wb') as ff:
    # Write the binary content of the response into the file
    ff.write(res.content)
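res.content loads the whole file into memory, which is fine for a small logo. For larger binaries, requests also supports streaming, so the file is written chunk by chunk. A minimal sketch reusing the same URL (the 8192-byte chunk size is an arbitrary choice, not from the original course):

import requests

res = requests.get('https://www.www.zyiz.net/i/ll/?i=20210424184053989.PNG', stream=True)
with open('datawhale.png', 'wb') as ff:
    # iter_content yields the response body piece by piece instead of all at once
    for chunk in res.iter_content(chunk_size=8192):
        ff.write(chunk)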
4. Parsing an HTML page with BeautifulSoup
Let's parse the Douban Books Top 250 page.
import io
import sys
import requests
from bs4 import BeautifulSoup

### If the output is garbled, you can switch the console encoding:
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
###
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
}
res = requests.get('https://book.douban.com/top250', headers=headers)
soup = BeautifulSoup(res.text, 'lxml')
print(soup)
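Printing the whole soup only confirms the request worked. To pull out just the book titles, query the parse tree; the sketch below assumes each title link sits inside a div with class pl2 (Douban's markup at the time of writing, which may have changed since):

# Continuing from the soup object above
for tag in soup.find_all('div', class_='pl2'):
    link = tag.find('a')
    if link is not None:
        # the full title is usually in the link's title attribute
        print(link.get('title') or link.get_text(strip=True))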
5. Scraping Ziroom apartment data
import requests
from bs4 import BeautifulSoup
import random
import time
import csv

user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)"]

# Scrape the first 50 listing pages on Ziroom
def get_info():
    csvheader = ['Name', 'Area', 'Orientation', 'Layout', 'Location', 'Floor', 'Elevator', 'Year built', 'Door lock', 'Greening']
    with open('wuhan_ziru.csv', 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(csvheader)
        for i in range(1, 51):  # 50 pages in total
            print('Scraping Ziroom page %s' % i)
            timelist = [1, 2, 3]
            print('Feeling a bit tired, taking a short break (¬㉨¬)')
            time.sleep(random.choice(timelist))  # sleep 1-3 seconds so we do not put too much pressure on the server!
            url = 'https://wh.ziroom.com/z/p%s/' % i
            headers = {'User-Agent': random.choice(user_agent)}
            r = requests.get(url, headers=headers)
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'lxml')
            all_info = soup.find_all('div', class_='info-box')
            print('Getting to work (๑><๑)')
            for info in all_info:
                href = info.find('a')
                if href is not None:
                    href = 'https:' + href['href']
                    try:
                        print('Scraping %s' % href)
                        house_info = get_house_info(href)
                        writer.writerow(house_info)
                    except Exception:
                        print('Something went wrong, could not open %s ( •̥́ ˍ •̀ू )' % href)

# Fetch the details of a single listing
def get_house_info(href):
    time.sleep(1)
    headers = {'User-Agent': random.choice(user_agent)}
    response = requests.get(url=href, headers=headers)
    response = response.content.decode('utf-8', 'ignore')
    soup = BeautifulSoup(response, 'lxml')
    name = soup.find('h1', class_='Z_name').text
    sinfo = soup.find('div', class_='Z_home_b clearfix').find_all('dd')
    area = sinfo[0].text
    orien = sinfo[1].text
    area_type = sinfo[2].text
    dinfo = soup.find('ul', class_='Z_home_o').find_all('li')
    location = dinfo[0].find('span', class_='va').text
    loucen = dinfo[1].find('span', class_='va').text
    dianti = dinfo[2].find('span', class_='va').text
    niandai = dinfo[3].find('span', class_='va').text
    mensuo = dinfo[4].find('span', class_='va').text
    lvhua = dinfo[5].find('span', class_='va').text
    # Fields: Name, Area, Orientation, Layout, Location, Floor, Elevator, Year built, Door lock, Greening
    room_info = [name, area, orien, area_type, location, loucen, dianti, niandai, mensuo, lvhua]
    return room_info

if __name__ == '__main__':
    get_info()
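Once the crawl finishes, the rows in wuhan_ziru.csv can be read back with the same csv module to spot-check the results; a minimal sketch:

import csv

with open('wuhan_ziru.csv', newline='') as f:
    for row in csv.reader(f):
        print(row)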
6. Scraping 36Kr news flashes and sending them by email
import requests
import random
from bs4 import BeautifulSoup
import smtplib                         # module for sending mail
from email.mime.text import MIMEText   # defines the email body
from email.header import Header        # defines the email subject

smtpserver = 'smtp.qq.com'
# Sender account username and password (for QQ Mail, use the SMTP authorization code, not the login password)
user = ''
password = ''
# Sender and recipient addresses
sender = ''
receive = ''

user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)"]

def main():
    print('Scraping data')
    url = 'https://36kr.com/newsflashes'
    headers = {'User-Agent': random.choice(user_agent)}
    response = requests.get(url, headers=headers)
    response = response.content.decode('utf-8', 'ignore')
    soup = BeautifulSoup(response, 'lxml')
    news = soup.find_all('a', class_='item-title')
    news_list = []
    for i in news:
        title = i.get_text()
        href = 'https://36kr.com' + i['href']
        news_list.append(title + '<br>' + href)
    info = '<br></br>'.join(news_list)
    print('Sending the email')
    send_email(info)

def send_email(content):
    # Send via QQ Mail
    title = '36Kr News Flashes'
    subject = title
    msg = MIMEText(content, 'html', 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = sender
    msg['To'] = receive
    # The SSL protocol requires port 465
    smtp = smtplib.SMTP_SSL(smtpserver, 465)  # this is the server port!
    # HELO identifies the client to the server
    smtp.helo(smtpserver)
    # EHLO and confirm the server's response
    smtp.ehlo(smtpserver)
    # Log in to the mail server with the username and password
    smtp.login(user, password)
    smtp.sendmail(sender, receive, msg.as_string())
    smtp.quit()

if __name__ == '__main__':
    main()
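As written, the script scrapes and emails once per run. To turn it into a daily digest, main() needs to be rerun on a schedule; the simplest sketch, using only the standard library (the 24-hour interval and the endless loop are my own choices, not part of the original course), replaces the __main__ block above:

import time

if __name__ == '__main__':
    while True:
        main()               # scrape 36Kr and send the email
        time.sleep(86400)    # wait 24 hours before the next run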
Note: sourced from Datawhale's office-automation course.
That concludes this introduction to Python web scraping and its applications. We hope the article is helpful, and we hope you will keep supporting 为之网!