本文共 3246 字,大约阅读时间需要 10 分钟。
今天的实战是爬取小说
# -*- coding:UTF-8 -*-
import os

import requests
from bs4 import BeautifulSoup


def mkdir(path):
    """Ensure *path* exists, then make it the current working directory.

    Fix: the original checked a hard-coded './xiaoshuo/' instead of *path*
    and carried two dead `folder = ...` assignments; makedirs(exist_ok=True)
    covers both the "exists" and "missing" branches in one call.
    """
    os.makedirs(path, exist_ok=True)
    os.chdir(path)


def down(url):
    """Crawl the chapter list at *url* and save each chapter as '<title>.txt'.

    Writes the files into './xiaoshuo/' (created relative to the CWD at call
    time; note mkdir() chdirs into it, so the open() below is relative to it).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36'
    }
    res = requests.get(url, headers=headers, timeout=20)
    # Index page is GBK-encoded; chapter pages below are decoded as UTF-8.
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    # Chapter links live in <dd> under div.listmain > dl; the slice drops
    # the "latest chapters" duplicates at the head and junk at the tail.
    dds = soup.select('div.listmain > dl >dd')[12:-11]
    path = './xiaoshuo/'
    mkdir(path)
    for dd in dds:
        # Chapter title text of the <a> tag.
        title = dd.a.get_text()
        # Chapter detail-page URL (site-relative href).
        novel_url = 'https://www.bqkan8.com' + dd.a['href']
        # Fetch the chapter page; fix: add the same timeout as the index
        # request so a stalled chapter cannot hang the whole crawl.
        datail_text = requests.get(url=novel_url, headers=headers, timeout=20)
        datail_text.encoding = 'utf-8'
        datail_soup = BeautifulSoup(datail_text.text, 'lxml')
        # Chapter body is the element with id="content" and class "showtxt".
        content = datail_soup.select_one('#content.showtxt').get_text().strip()
        with open(title + '.txt', 'w', encoding='utf-8') as f:
            f.write(title + ':' + content + '\n')
        print(title, '爬取成功!!')


if __name__ == '__main__':
    # 首页地址 (novel index page)
    url = 'https://www.bqkan8.com/3_3047/'
    down(url)
文章链接地址我们可以看到在 dl 标签下的 dd 标签里面。
# Step 1: first fetch the content of a single chapter page.
import requests
from bs4 import BeautifulSoup


def get_one_novel(url):
    """Download one chapter page at *url* and print its title and body text.

    Fixes over the original snippet: it called an undefined get_response()
    with an undefined `html_url`, misspelled `response`, and read from an
    undefined `datail_soup` — all replaced with a direct requests call and
    the one `soup` object actually built here.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=20)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    # 获取小说标题 — chapter title in <h1> under .content
    title = soup.select_one('.content > h1 ').get_text()
    # 获取小说内容 — chapter body text, id="content" class "showtxt"
    content = soup.select_one('#content.showtxt').get_text().strip()
    print(title, content)


if __name__ == '__main__':
    url = 'https://www.bqkan8.com/3_3047/1356228.html'
    get_one_novel(url)
已经打印了第一页
要获取章节我们执行上述步骤
所有链接地址都在 dl 下的 dd 标签里面,所以我们遍历一下就能得到了。完整代码如下:
# -*- coding:UTF-8 -*-
import os

import requests
from bs4 import BeautifulSoup


def mkdir(path):
    """Ensure *path* exists, then make it the current working directory.

    Fix: the original checked a hard-coded './xiaoshuo/' instead of *path*
    and carried two dead `folder = ...` assignments; makedirs(exist_ok=True)
    covers both the "exists" and "missing" branches in one call.
    """
    os.makedirs(path, exist_ok=True)
    os.chdir(path)


def down(url):
    """Crawl the chapter list at *url* and save each chapter as '<title>.txt'.

    Writes the files into './xiaoshuo/' (created relative to the CWD at call
    time; note mkdir() chdirs into it, so the open() below is relative to it).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36'
    }
    res = requests.get(url, headers=headers, timeout=20)
    # Index page is GBK-encoded; chapter pages below are decoded as UTF-8.
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'lxml')
    # 这里面文章链接有多余的部分,通过切片切割掉:drop the "latest chapter"
    # duplicates at the head and junk entries at the tail of the <dd> list.
    dds = soup.select('div.listmain > dl >dd')[12:-11]
    path = './xiaoshuo/'
    mkdir(path)
    for dd in dds:
        # Chapter title text of the <a> tag.
        title = dd.a.get_text()
        # Chapter detail-page URL (site-relative href).
        novel_url = 'https://www.bqkan8.com' + dd.a['href']
        # Fetch the chapter page; fix: add the same timeout as the index
        # request so a stalled chapter cannot hang the whole crawl.
        datail_text = requests.get(url=novel_url, headers=headers, timeout=20)
        datail_text.encoding = 'utf-8'
        datail_soup = BeautifulSoup(datail_text.text, 'lxml')
        # Chapter body is the element with id="content" and class "showtxt".
        content = datail_soup.select_one('#content.showtxt').get_text().strip()
        with open(title + '.txt', 'w', encoding='utf-8') as f:
            f.write(title + ':' + content + '\n')
        print(title, '爬取成功!!')


if __name__ == '__main__':
    # 首页地址 (novel index page)
    url = 'https://www.bqkan8.com/3_3047/'
    down(url)
如图所示
我就不一一贴图了,感兴趣的可以对网页分析一波,再对比一下代码就懂了。 谢谢大家光临我的博客, 点个赞再走吧。转载地址:http://xoywi.baihongyu.com/