1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
| import requests from bs4 import BeautifulSoup
# One HTTP session shared by getHrefs() and getPlayUrls().
# requests.session() is a deprecated alias kept only for backwards
# compatibility; requests.Session() is the supported constructor.
session = requests.Session()

# Browser-like headers (Firefox 58 user-agent string) passed explicitly with
# every request made through `session`.
# NOTE(review): 'br' is advertised in accept-encoding, but requests/urllib3 can
# only decode Brotli bodies when a brotli package is installed -- confirm it is
# available in the runtime environment, or drop 'br' from the value.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'accept-language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'accept-encoding': 'gzip, deflate, br',
}
def getHrefs(url, timeout=10):
    """Collect the play-data link and title of every player entry on a page.

    The page is fetched through the module-level ``session`` (sending
    ``headers``), decoded as GBK and parsed with BeautifulSoup/lxml.  For each
    ``<div class="player">`` the first ``<a>`` supplies both the link and the
    title.  Every ``play`` in the link is rewritten to ``playdata`` and the
    result is turned into an absolute URL.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        A list of ``{'playData': <absolute url>, 'title': <anchor text>}``
        dicts in document order.  Player divs without a usable link are
        skipped.
    """
    from urllib.parse import urljoin

    # NOTE(review): verify=False turns off TLS certificate verification.  It is
    # kept because the original code relied on it, but it should be removed if
    # the target site presents a valid certificate.
    r = session.get(url, headers=headers, verify=False, timeout=timeout)
    r.encoding = 'GBK'
    soup = BeautifulSoup(r.text, 'lxml')

    playDatas = []
    for item in soup.find_all('div', class_='player'):
        anchor = item.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # No anchor (or an anchor without href): nothing to resolve.
            continue

        # Rewrite before resolving so that only the href itself is touched,
        # never the host taken from the page URL.
        playData = href.replace('play', 'playdata')
        if playData.startswith('//'):
            # Protocol-relative link: keep the original choice of plain http.
            # Only the leading '//' is prefixed; a blanket replace would also
            # corrupt hrefs that are already absolute ('http://...').
            playData = 'http:' + playData
        else:
            # Absolute hrefs pass through unchanged; site-relative ones are
            # resolved against the page they were found on.
            playData = urljoin(url, playData)

        playDatas.append({
            'playData': playData,
            'title': anchor.text,
        })
    return playDatas
def getPlayUrls(bookTitle, playDatas, timeout=10):
    """Resolve each play-data entry into a playable media record.

    Every entry's ``'playData'`` URL is POSTed through the module-level
    ``session`` (sending ``headers``).  The JSON reply must contain an
    ``'urlpath'`` key; every ``flv`` substring in that value is replaced with
    ``mp3`` to form the final URL.

    Args:
        bookTitle: Book name used as the prefix of each record's title.
        playDatas: Iterable of dicts with ``'playData'`` and ``'title'`` keys,
            as produced by ``getHrefs``.  May be empty or ``None``.
        timeout: Seconds to wait for each response before requests gives up.

    Returns:
        A list with one dict per entry, holding ``'playUrl'``, ``'title'``
        (``"<bookTitle>-<entry title>"``), ``'id'`` (position in the input)
        and ``'played'`` (always ``0``).  The list is empty when there is
        nothing to resolve, so callers always receive a list.
    """
    if not playDatas:
        return []

    playUrls = []
    for index, item in enumerate(playDatas):
        # NOTE(review): verify=False turns off TLS certificate verification;
        # kept from the original, remove if the endpoint has a valid cert.
        r = session.post(
            item['playData'],
            headers=headers,
            verify=False,
            timeout=timeout,
        )
        resp = r.json()
        playUrls.append({
            # NOTE(review): replaces 'flv' anywhere in the URL, not only the
            # file extension -- confirm against real 'urlpath' values.
            'playUrl': resp['urlpath'].replace('flv', 'mp3'),
            'title': '{bookTitle}-{title}'.format(bookTitle=bookTitle, title=item['title']),
            'id': index,
            'played': 0,
        })
    return playUrls
if __name__ == "__main__":
    # Script entry point: scrape one listing page, resolve every entry into a
    # play URL record and print the resulting list.
    listing_url = 'http://xxxx.xxxx.xxx/xxxxx/'
    book_name = 'xxxx第六部'
    entries = getHrefs(listing_url)
    print(getPlayUrls(book_name, entries))
|