1 import requests
2 from bs4 import BeautifulSoup
3 import json,re,os
4 from urllib.parse import urlencode
5 from hashlib import md5
6 from multiprocessing.pool import Pool
7 from requests.exceptions import RequestException
8 import pymongo
# Pull in all configuration constants (MONGO_URL, MONGO_DB, MONGO_TABLE,
# KEYWORD, GROUP_START, GROUP_END) from the config module.
10 from config import *
11 from json.decoder import JSONDecodeError
12
# MongoDB client used to persist scraped records.
# connect=False defers the actual connection until first use, which keeps the
# client safe to share with the multiprocessing Pool workers started in
# __main__. MONGO_URL / MONGO_DB come from the config module.
client=pymongo.MongoClient(MONGO_URL,connect=False)
db= client[MONGO_DB]
# Persistence helper: inserts one scraped record into the MongoDB collection.
# (Images themselves are saved to disk separately by save_image.)
def save_to_mongo(result):
    """Persist one scraped record to MongoDB.

    Args:
        result: dict produced by parse_page_detail
                (keys: 'title', 'url', 'images').

    Returns:
        True if the write was acknowledged by the server, False otherwise.
    """
    # Collection.insert() was deprecated in pymongo 3 and removed in pymongo 4;
    # insert_one() is the supported API and its result reports acknowledgement.
    if db[MONGO_TABLE].insert_one(result).acknowledged:
        print('存储到MongoDB成功',result)
        return True
    return False
#fetch the search index page data
def get_page_index(offset,keyword):
    """Fetch one page of the Toutiao gallery search index.

    Args:
        offset: pagination offset (multiples of 20).
        keyword: search keyword.

    Returns:
        The response body (JSON text) on success, None on any request error.
    """
    data= {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from':'gallery'
    }
    # Encode the query parameters onto the search endpoint.
    url='https://www.toutiao.com/search_content/?'+urlencode(data)
    try:
        # A timeout keeps a stalled connection from hanging a pool worker
        # forever (the original call could block indefinitely).
        response= requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding= response.apparent_encoding
        return response.text
    except RequestException:
        print('爬取索引页失败!')
        return None
#parse the index page JSON
def parse_page_index(html):
    """Yield each article detail-page URL from an index-page JSON payload.

    Entries carrying a 'cell_type' field are non-article cells (ads etc.)
    and are skipped. Yields nothing when the payload is not valid JSON or
    has no 'data' key.
    """
    try:
        payload = json.loads(html)
    except JSONDecodeError:
        return
    if not payload or 'data' not in payload:
        return
    for entry in payload['data']:
        # Only plain article entries have no cell_type marker.
        if entry.get('cell_type') is None:
            yield entry.get('article_url')
58
def get_page_detail(url):
    """Fetch a gallery detail page.

    Args:
        url: the article_url taken from the index page.

    Returns:
        The page HTML text on success, None on any request error.
    """
    # The server rejects requests without a browser-like User-Agent.
    headers={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '}
    try:
        # timeout prevents a hung connection from blocking a pool worker.
        response=requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding= response.apparent_encoding
        return response.text
    except RequestException:
        print('爬取详情页失败!',url)
        return None
72
def parse_page_detail(html,url):
    """Extract the title and image URLs from a gallery detail page.

    Side effect: every image found is downloaded via download_image().

    Args:
        html: detail-page HTML text.
        url: the page's own URL, stored in the returned record.

    Returns:
        dict with 'title', 'url', 'images' on success, otherwise None.
    """
    # Default the title so a parse failure cannot leave it unbound
    # (the original code raised NameError further down in that case).
    title = ''
    try:
        soup=BeautifulSoup(html,'lxml')
        title=soup.select('title')[0].get_text()
        if title: print(title)
    except Exception:
        pass

    # The gallery payload is embedded as: gallery: JSON.parse("...escaped...")
    # The dot and parentheses must be escaped; the original unescaped pattern
    # treated them as regex syntax and could never match the page as intended.
    images_pattern=re.compile(r'gallery: JSON\.parse\("(.*?)"\)',re.S)
    result= re.search(images_pattern,html)

    if result:
        # group(1) is the body of a JS string literal. Re-wrap it in quotes
        # and run it through the JSON parser once to decode its escape
        # sequences, then parse the resulting JSON text itself. (The original
        # .replace('','') was a no-op and left the escapes in place.)
        try:
            data=json.loads(json.loads('"' + result.group(1) + '"'))
        except JSONDecodeError:
            return None
        if data and 'sub_images' in data:
            sub_images=data.get('sub_images')
            # Collect every image URL in the gallery.
            images=[item.get('url') for item in sub_images]
            # Save each image to the local 'image' directory.
            for image in images:
                download_image(title,image)
            return {'title':title,
                    'url':url,
                    'images':images}
    else:
        print('空')
103
def save_image(title,result):
    """Write raw image bytes to image/<title>/<md5>.jpg, skipping duplicates.

    Args:
        title: gallery title, used as the sub-directory name.
        result: raw image bytes (response content).
    """
    # os.path.join everywhere — the original mixed os.path.sep concatenation
    # with a hard-coded '/' in the format string.
    img_path=os.path.join('image', title)
    # exist_ok avoids a race when several pool workers hit the same title.
    os.makedirs(img_path, exist_ok=True)
    # md5 of the content gives a stable name, so identical images are
    # detected and never written twice.
    file_path=os.path.join(img_path, '{0}.{1}'.format(md5(result).hexdigest(),'jpg'))
    if not os.path.exists(file_path):
        with open(file_path,'wb') as f:
            f.write(result)
        print("%s下载完成"%file_path)
    else:
        print("%s已经存在"%file_path)
115
def download_image(title,url):
    """Download one image and save it to disk via save_image.

    Args:
        title: gallery title (used as the target sub-directory).
        url: image URL.

    Returns:
        False on a request error, None otherwise (matches original contract).
    """
    try:
        print('正在下载',url)
        # timeout keeps a stalled download from blocking a pool worker.
        r= requests.get(url, timeout=10)
        r.raise_for_status()
        # NOTE: the original set r.encoding = r.apparent_encoding here, which
        # runs charset detection over binary image bytes — expensive and
        # meaningless since only r.content (raw bytes) is used below.
        save_image(title,r.content)
    except RequestException:
        print('请求图片出错',url)
        return False
127
def main(offset):
    """Scrape one index page at the given offset and persist its galleries."""
    index_html= get_page_index(offset,KEYWORD)
    for detail_url in parse_page_index(index_html):
        detail_html= get_page_detail(detail_url)
        if not detail_html:
            continue
        record= parse_page_detail(detail_html,detail_url)
        if record:
            save_to_mongo(record)
137
138
139
if __name__=='__main__':
    # Fan the index-page offsets out across a pool of worker *processes*
    # (multiprocessing.Pool, not threads — the original comment was wrong).
    pool= Pool()
    group=[x*20 for x in range(GROUP_START,GROUP_END+1)]
    pool.map(main,group)
    pool.close()
    # Wait for all workers to finish cleanly; the original omitted join(),
    # so the parent could exit while workers were still running.
    pool.join()