- A+
所属分类:python
python提取《妖神记》小说
在网页上看小说的体验比不上在 Kindle 上看,但 Kindle 浏览网页的效果又不好,所以还是把网页上的小说内容提取下来,生成 txt 文件,再放到 Kindle 上阅读比较合适。下面是使用 Python 的 requests 库和 re 库提取小说内容的脚本,比较简单:get_urls 用于提取主页面的所有章节地址,get_info 对每个章节的内容进行提取并写入文件。
# python提取《妖神记》小说
import requests
import re
import time
# from bs4 import BeautifulSoup
# Request headers shared by get_urls() and get_info(): a desktop-browser
# User-Agent string.
headers = {
    # Adjacent string literals are joined at compile time.  The explicit
    # trailing space keeps "like Gecko)" and "Chrome/..." as separate
    # User-Agent tokens; a backslash continuation inside the literal glued
    # them together (or injected the next line's indentation).
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/59.0.3071.112 Safari/537.36 Vivaldi/1.91.867.48'
    ),
}

# Output file, opened in append mode so repeated runs add to the existing
# text.  The encoding is explicit because the content is Chinese text and the
# platform default codec may be unable to encode it.
f = open("妖神记.txt", 'a+', encoding='utf-8')
def get_urls(base_url='http://www.tsxsw.com/html/20/20079/'):
    """Download every chapter linked from the novel's index page.

    Fetches ``base_url``, extracts each ``<li>`` chapter link (a relative
    ``<digits>.html`` href plus its link text) and, for every chapter in page
    order, writes the title to the module-level output file ``f``, prints
    it, and calls ``get_info`` with the chapter's absolute URL.  Pauses
    0.5 seconds between chapters.

    Args:
        base_url: URL of the index page.  Chapter hrefs are appended to it,
            so a missing trailing slash is added automatically.  Defaults to
            the index page this script was written for.

    Raises:
        requests.exceptions.RequestException: if the index page cannot be
            fetched, including when the server does not answer within the
            timeout.
        UnicodeDecodeError: if the page body is not valid GBK.
    """
    prefix = base_url if base_url.endswith('/') else base_url + '/'

    # A timeout keeps a stalled connection from blocking the script forever.
    res = requests.get(base_url, headers=headers, timeout=30)
    if res.status_code != 200:
        # Same policy as get_info: an error page is not parsed.
        print('index page request failed: HTTP {}'.format(res.status_code))
        return

    html = res.content.decode('GBK')
    # Raw string so that \d reaches the regex engine untouched; the dot in
    # ".html" is escaped so it only matches a literal dot.
    chapters = re.findall(
        r'<li>.*?<a href="(\d*?\.html)">(.*?)</a>.*?</li>', html, re.S)

    for href, title in chapters:
        f.write(title + '\n')
        print(title)
        get_info(prefix + href)
        time.sleep(0.5)  # throttle requests between chapters
def get_info(url):
    """Fetch one chapter page and append its text to the output file.

    Only the first ``<p>...</p>`` element of the page is used.  Its content
    has non-breaking spaces turned into plain spaces and each
    ``<br /><br />`` pair turned into a newline, then is written to the
    module-level file ``f`` followed by a newline.  Responses with a status
    other than 200, and pages without a ``<p>`` element, are skipped without
    writing anything.

    Args:
        url: Absolute URL of the chapter page.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the server does not answer within the timeout.
        UnicodeDecodeError: if the page body is not valid GBK.
    """
    # A timeout keeps a stalled connection from blocking the script forever.
    res = requests.get(url, headers=headers, timeout=30)
    if res.status_code != 200:
        return

    first_paragraph = re.search(
        r'<p>(.*?)</p>', res.content.decode('GBK'), re.S)
    if first_paragraph is None:
        return

    text = first_paragraph.group(1)
    # NOTE(review): the original replace() swapped a space for a space, which
    # looks like an "&nbsp;" entity mangled by HTML rendering.  Both the
    # entity and the literal U+00A0 character are handled here.
    text = text.replace('&nbsp;', ' ').replace('\xa0', ' ')
    text = text.replace('<br /><br />', '\n')
    f.write(text + '\n')
# Run the scrape.  The output file is closed even if a request or decode
# error aborts the run, so text written so far is flushed to disk.
try:
    get_urls()
finally:
    f.close()
print("完成")