1 import requests
2 from bs4 import BeautifulSoup
3 import json,re,os
4 from urllib.parse import urlencode
5 from hashlib import md5
6 from multiprocessing.pool import Pool
7 from requests.exceptions import RequestException
8 import pymongo
# Pull in all configuration constants (MONGO_URL, MONGO_DB, MONGO_TABLE,
# KEYWORD, GROUP_START, GROUP_END) from the config module.
10 from config import *
11 from json.decoder import JSONDecodeError
12
# MongoDB client used to persist scraped records.
# connect=False defers the actual connection until first use, which keeps the
# client safe to share with the multiprocessing Pool workers started in
# __main__. MONGO_URL / MONGO_DB come from the config module.
client=pymongo.MongoClient(MONGO_URL,connect=False)
db= client[MONGO_DB]
# Persistence helper: inserts one scraped record into the MongoDB collection.
# (Images themselves are saved to disk separately by save_image.)
def save_to_mongo(result):
    """Persist one scraped record to MongoDB.

    Args:
        result: dict produced by parse_page_detail
                (keys: 'title', 'url', 'images').

    Returns:
        True if the write was acknowledged by the server, False otherwise.
    """
    # Collection.insert() was deprecated in pymongo 3 and removed in pymongo 4;
    # insert_one() is the supported API and its result reports acknowledgement.
    if db[MONGO_TABLE].insert_one(result).acknowledged:
        print('存储到MongoDB成功',result)
        return True
    return False
#fetch the search index page data
def get_page_index(offset,keyword):
    """Fetch one page of the Toutiao gallery search index.

    Args:
        offset: pagination offset (multiples of 20).
        keyword: search keyword.

    Returns:
        The response body (JSON text) on success, None on any request error.
    """
    data= {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from':'gallery'
    }
    # Encode the query parameters onto the search endpoint.
    url='https://www.toutiao.com/search_content/?'+urlencode(data)
    try:
        # A timeout keeps a stalled connection from hanging a pool worker
        # forever (the original call could block indefinitely).
        response= requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding= response.apparent_encoding
        return response.text
    except RequestException:
        print('爬取索引页失败!')
        return None
#parse the index page JSON
def parse_page_index(html):
    """Yield each article detail-page URL from an index-page JSON payload.

    Entries carrying a 'cell_type' field are non-article cells (ads etc.)
    and are skipped. Yields nothing when the payload is not valid JSON or
    has no 'data' key.
    """
    try:
        payload = json.loads(html)
    except JSONDecodeError:
        return
    if not payload or 'data' not in payload:
        return
    for entry in payload['data']:
        # Only plain article entries have no cell_type marker.
        if entry.get('cell_type') is None:
            yield entry.get('article_url')
58
def get_page_detail(url):
    """Fetch a gallery detail page.

    Args:
        url: the article_url taken from the index page.

    Returns:
        The page HTML text on success, None on any request error.
    """
    # The server rejects requests without a browser-like User-Agent.
    headers={'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '}
    try:
        # timeout prevents a hung connection from blocking a pool worker.
        response=requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding= response.apparent_encoding
        return response.text
    except RequestException:
        print('爬取详情页失败!',url)
        return None
72
def parse_page_detail(html,url):
    """Extract the title and image URLs from a gallery detail page.

    Side effect: every image found is downloaded via download_image().

    Args:
        html: detail-page HTML text.
        url: the page's own URL, stored in the returned record.

    Returns:
        dict with 'title', 'url', 'images' on success, otherwise None.
    """
    # Default the title so a parse failure cannot leave it unbound
    # (the original code raised NameError further down in that case).
    title = ''
    try:
        soup=BeautifulSoup(html,'lxml')
        title=soup.select('title')[0].get_text()
        if title: print(title)
    except Exception:
        pass

    # The gallery payload is embedded as: gallery: JSON.parse("...escaped...")
    # The dot and parentheses must be escaped; the original unescaped pattern
    # treated them as regex syntax and could never match the page as intended.
    images_pattern=re.compile(r'gallery: JSON\.parse\("(.*?)"\)',re.S)
    result= re.search(images_pattern,html)

    if result:
        # group(1) is the body of a JS string literal. Re-wrap it in quotes
        # and run it through the JSON parser once to decode its escape
        # sequences, then parse the resulting JSON text itself. (The original
        # .replace('','') was a no-op and left the escapes in place.)
        try:
            data=json.loads(json.loads('"' + result.group(1) + '"'))
        except JSONDecodeError:
            return None
        if data and 'sub_images' in data:
            sub_images=data.get('sub_images')
            # Collect every image URL in the gallery.
            images=[item.get('url') for item in sub_images]
            # Save each image to the local 'image' directory.
            for image in images:
                download_image(title,image)
            return {'title':title,
                    'url':url,
                    'images':images}
    else:
        print('空')
103
def save_image(title,result):
    """Write raw image bytes to image/<title>/<md5>.jpg, skipping duplicates.

    Args:
        title: gallery title, used as the sub-directory name.
        result: raw image bytes (response content).
    """
    # os.path.join everywhere — the original mixed os.path.sep concatenation
    # with a hard-coded '/' in the format string.
    img_path=os.path.join('image', title)
    # exist_ok avoids a race when several pool workers hit the same title.
    os.makedirs(img_path, exist_ok=True)
    # md5 of the content gives a stable name, so identical images are
    # detected and never written twice.
    file_path=os.path.join(img_path, '{0}.{1}'.format(md5(result).hexdigest(),'jpg'))
    if not os.path.exists(file_path):
        with open(file_path,'wb') as f:
            f.write(result)
        print("%s下载完成"%file_path)
    else:
        print("%s已经存在"%file_path)
115
def download_image(title,url):
    """Download one image and save it to disk via save_image.

    Args:
        title: gallery title (used as the target sub-directory).
        url: image URL.

    Returns:
        False on a request error, None otherwise (matches original contract).
    """
    try:
        print('正在下载',url)
        # timeout keeps a stalled download from blocking a pool worker.
        r= requests.get(url, timeout=10)
        r.raise_for_status()
        # NOTE: the original set r.encoding = r.apparent_encoding here, which
        # runs charset detection over binary image bytes — expensive and
        # meaningless since only r.content (raw bytes) is used below.
        save_image(title,r.content)
    except RequestException:
        print('请求图片出错',url)
        return False
127
def main(offset):
    """Scrape one index page at the given offset and persist its galleries."""
    index_html= get_page_index(offset,KEYWORD)
    for detail_url in parse_page_index(index_html):
        detail_html= get_page_detail(detail_url)
        if not detail_html:
            continue
        record= parse_page_detail(detail_html,detail_url)
        if record:
            save_to_mongo(record)
137
138
139
if __name__=='__main__':
    # Fan the index-page offsets out across a pool of worker *processes*
    # (multiprocessing.Pool, not threads — the original comment was wrong).
    pool= Pool()
    group=[x*20 for x in range(GROUP_START,GROUP_END+1)]
    pool.map(main,group)
    pool.close()
    # Wait for all workers to finish cleanly; the original omitted join(),
    # so the parent could exit while workers were still running.
    pool.join()