import re
import requests
import os
from urllib import request
import json
from mysql_tu import mysql_conn
# HTTP headers sent with every request; the user-agent string identifies the
# client as desktop Chrome 67 on Windows 10.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/67.0.3396.99 Safari/537.36'
    ),
}
# Search endpoint queried for the URL-encoded keyword in the query string;
# the `{}` placeholder receives the paging offset.
SEARCH_URL = (
    'https://www.toutiao.com/search_content/?offset={}&format=json'
    '&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1'
    '&from=search_tab'
)

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 10

# Directory created at start-up (image download itself is currently disabled).
IMAGE_DIR = 'cccc'

# Matches `gallery: JSON.parse("<escaped json>")` in an article page and
# captures the complete double-quoted string literal. The dot and the
# parentheses are escaped so they match literally instead of acting as a
# wildcard / capture group, and the literal is matched quote-to-quote so a
# `),` inside the payload cannot end the match early.
GALLERY_RE = re.compile(r'gallery: JSON\.parse\(\s*("(?:[^"\\]|\\.)*")\s*\)')


def extract_gallery_image_urls(html):
    """Return the image URLs listed in an article page's gallery.

    The page embeds the gallery as a JSON document wrapped in a quoted string
    literal, so it is decoded twice: once to unwrap the literal, once to parse
    the JSON document it contains.

    Args:
        html: Text of the article page.

    Returns:
        A list with the ``url`` of every entry under ``sub_images`` (possibly
        empty), or ``None`` when the page has no parsable gallery.
    """
    found = GALLERY_RE.search(html)
    if not found:
        return None
    try:
        gallery = json.loads(json.loads(found.group(1)))
    except (TypeError, ValueError):
        # Malformed payload: treat it the same as a page without a gallery.
        return None
    if not isinstance(gallery, dict):
        return None
    return [
        image['url']
        for image in gallery.get('sub_images') or []
        if isinstance(image, dict) and 'url' in image
    ]


def build_filename(image_url):
    """Return the last path segment of *image_url* with ``.jpg`` appended."""
    return image_url.split('/')[-1] + '.jpg'


def save_filename(filename):
    """Insert *filename* into the ``jiepai`` table.

    The value originates from a remote page, so backslashes and double quotes
    are escaped before it is placed inside the double-quoted SQL literal.
    NOTE(review): if ``execute_modify_mysql`` can take bound parameters,
    prefer that over string formatting — confirm against ``mysql_tu``.
    """
    escaped = filename.replace('\\', '\\\\').replace('"', '\\"')
    sql = 'insert into jiepai(filename) values("{filename}")'.format(
        filename=escaped)
    # A fresh helper object per insert, as before; whether it can be reused
    # across inserts is not visible from this file.
    mc = mysql_conn()
    mc.execute_modify_mysql(sql)


def main():
    """Crawl three search result pages and record every gallery image name.

    For each search hit that has an ``article_url``, the article page is
    fetched, its gallery is parsed, and one row per image filename is
    inserted into the database. Articles without a gallery are reported and
    skipped.
    """
    # Loop-invariant: create the output directory once, not once per page.
    os.makedirs(IMAGE_DIR, exist_ok=True)

    for offset in range(0, 60, 20):
        response = requests.get(
            SEARCH_URL.format(offset), headers=headers,
            timeout=REQUEST_TIMEOUT)
        # `data` may be missing or null in the response.
        data_list = response.json().get('data') or []

        for data_item in data_list:
            if 'article_url' not in data_item:
                continue

            try:
                article = requests.get(
                    data_item['article_url'], headers=headers,
                    timeout=REQUEST_TIMEOUT)
            except requests.exceptions.Timeout:
                # One slow article must not abort the whole crawl.
                print('下载超时')
                continue

            image_urls = extract_gallery_image_urls(article.text)
            if image_urls is None:
                print('没有那个文件')
                continue

            for image_url in image_urls:
                print(image_url)
                try:
                    # Only the filename is stored; the image is not fetched.
                    save_filename(build_filename(image_url))
                except TimeoutError:
                    print('下载超时')
                    continue


if __name__ == '__main__':
    main()