import requests,time
from urllib.parse import urlencode
import os
from hashlib import md5
class ToutiaoSpider:
    """Crawl Toutiao search results and download the images they reference.

    The spider pages through the ``search_content`` Ajax endpoint using the
    query parameters in ``self.params``, extracts the image URLs attached to
    each result, and stores every image in a directory named after the
    result's title.
    """

    # Seconds to wait for a response before a request is abandoned.
    REQUEST_TIMEOUT = 10

    # Maps characters that are illegal in directory names on common
    # filesystems to an underscore.
    _PATH_CHAR_TABLE = str.maketrans(dict.fromkeys('\\/:*?"<>|', '_'))

    def __init__(self):
        self.params = {
            'offset': None,
            'format': 'json',
            'keyword': '街拍',
            'autoload': 'true',
            'count': '20',
            'cur_tab': '1',
            'from': 'search_tab',
            'pd': 'synthesis',
        }
        self.ua = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        self.headers = {
            'user-agent': self.ua,
            # The keyword is percent-encoded because HTTP header values must
            # be latin-1 encodable; a raw non-ASCII referer cannot be sent.
            'referer': 'https://www.toutiao.com/search/?'
                       + urlencode({'keyword': self.params['keyword']}),
            'content-type': 'application/x-www-form-urlencoded',
        }

    def get_page(self, offset):
        """Fetch one page of Ajax-loaded search results.

        Args:
            offset: Value sent as the ``offset`` query parameter. It is also
                stored in ``self.params['offset']``.

        Returns:
            The decoded JSON payload, or ``None`` when the request fails, the
            response status is not 200, or the body is not valid JSON.
        """
        self.params['offset'] = offset
        url = 'https://www.toutiao.com/search_content/?' + urlencode(self.params)
        print(url)
        try:
            r = requests.get(url, headers=self.headers,
                             timeout=self.REQUEST_TIMEOUT)
            if r.status_code != 200:
                return None
            r.encoding = 'utf-8'
            return r.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that cannot be decoded as JSON.
            print('Failed to fetch page:', e)
            return None

    def get_images(self, json_data):
        """Yield one ``{'image': url, 'title': title}`` dict per image.

        Args:
            json_data: Payload returned by :meth:`get_page`; may be ``None``.

        Yields:
            A dict for every entry of every result's ``image_list``. Results
            without a title or image list, and image entries without a URL,
            are skipped because they cannot be saved.
        """
        if not json_data:
            return
        for item in json_data.get('data') or []:
            title = item.get('title')
            images = item.get('image_list')
            if not title or not images:
                continue
            for image in images:
                url = image.get('url')
                if url:
                    yield {'image': url, 'title': title}

    @classmethod
    def _safe_dirname(cls, title):
        """Return *title* with characters unsuitable for a directory name removed."""
        # Stripping dots as well keeps '.' and '..' from escaping the
        # working directory.
        return title.translate(cls._PATH_CHAR_TABLE).strip(' .')

    def save_image(self, item):
        """Download one image and store it under a directory named by its title.

        The file name is the MD5 digest of the image content, so an image
        that was already downloaded is not written twice.

        Args:
            item: Dict with the keys ``'image'`` (image URL, possibly without
                a scheme) and ``'title'``.
        """
        title = item.get('title')
        url = item.get('image')
        if not title or not url:
            print('Failed to save Image')
            return
        # Only add a scheme when the URL does not already carry one.
        if not url.startswith(('http://', 'https://')):
            url = 'http:' + url
        directory = self._safe_dirname(title)
        print('download running:', url)
        try:
            os.makedirs(directory, exist_ok=True)
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            if r.status_code != 200:
                print('页面下载失败!')
                return
            file_path = os.path.join(
                directory, md5(r.content).hexdigest() + '.jpg')
            if os.path.exists(file_path):
                print('Already Download,', file_path)
                return
            with open(file_path, 'wb') as fp:
                fp.write(r.content)
            print('download finished')
        except (requests.RequestException, OSError) as e:
            print(e)
            print('Failed to save Image')

    def run(self, offsets):
        """Crawl every offset in *offsets* and save all images found.

        Args:
            offsets: Iterable of result offsets to request, one page each.
        """
        for offset in offsets:
            json_data = self.get_page(offset)
            for item in self.get_images(json_data):
                print(item)
                self.save_image(item)
            # Pause between pages to avoid hammering the server.
            time.sleep(1)
if __name__ == '__main__':
    # Request the offsets 0, 20, ..., 280 (fifteen pages).
    spider = ToutiaoSpider()
    spider.run(list(range(0, 300, 20)))