当前位置:聪少自媒体网 > 今日头条 > 正文

今日头条街拍图片爬取

2020-10-05 今日头条 聪少自媒体

import re

import requests

import os

from urllib import request

import json

from mysql_tu import mysql_conn

# Crawl Toutiao search results for the keyword "街拍" (the URL-encoded
# `keyword` parameter below), extract the image gallery embedded in each
# article page, and insert one row per image file name into the MySQL table
# `jiepai`.  The images themselves are NOT downloaded; only their names are
# recorded.

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Search endpoint; `{}` is filled with the paging offset (20 results a page).
SEARCH_URL = (
    'https://www.toutiao.com/search_content/'
    '?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D'
    '&autoload=true&count=20&cur_tab=1&from=search_tab'
)

# Seconds to wait for any single HTTP request.  Without a timeout
# `requests.get` may block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Article pages embed the gallery as `gallery: JSON.parse("<json string>"),`.
# The parentheses and the dot must be escaped so that group 1 captures only
# the quoted JSON string literal, not the surrounding "(" and ")".
GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\((.*)\),')

# Create the output directory once; `exist_ok` makes re-runs harmless.
os.makedirs('cccc', exist_ok=True)

for offset in range(0, 60, 20):
    try:
        response = requests.get(
            SEARCH_URL.format(offset),
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
    except requests.exceptions.Timeout:
        print('下载超时')
        continue

    html_json_dict = response.json()

    # 'data' can be absent or null; treat both as "no results on this page".
    data_list = html_json_dict.get('data') or []

    for data_item in data_list:
        # Only entries that carry an article link can be followed.
        if 'article_url' not in data_item:
            continue
        article_url = data_item['article_url']

        try:
            article_response = requests.get(
                article_url,
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
        except requests.exceptions.Timeout:
            print('下载超时')
            continue

        match_res = GALLERY_PATTERN.search(article_response.text)
        if not match_res:
            # The page has no embedded gallery to read.
            print('没有那个文件')
            continue

        # The captured text is a quoted JSON string whose content is itself
        # JSON, hence two decoding passes: str literal -> str -> dict.
        gallery = json.loads(json.loads(match_res.group(1)))

        for image in gallery['sub_images']:
            image_url = image['url']
            print(image_url)

            # Use the last path segment of the image URL as the file name.
            filename = image_url.split('/')[-1] + '.jpg'

            # NOTE(review): this SQL is assembled by string formatting from
            # remote data, which is open to SQL injection.  Switch to a
            # parameterized query once the `mysql_conn` helper's API is
            # confirmed to support one.
            sql = 'insert into jiepai(filename) values("{filename}")'.format(
                filename=filename
            )

            try:
                # A fresh helper object per insert, as before; whether the
                # helper can be reused across statements is not visible here.
                mc = mysql_conn()
                mc.execute_modify_mysql(sql)
            except TimeoutError:
                print('下载超时')
                continue

聪少爱学堂聪少
聪少爱学堂创始人,梅州市鹏鑫网络科技有限公司CEO,09年开始踏入互联网,10年互联网行业经验,资深自媒体人,自媒体优秀导师,咪挺微商团队营销引流顾问,业务包含:精准引流技术/代引流精准粉,专业小红书,知乎,微博代运营。
  • 38988文章总数
  • 1491135访问次数
  • 建站天数
  • 合作伙伴