当前位置:聪少自媒体网 > 今日头条 > 正文

爬虫实例:今日头条爬虫

2020-10-05 今日头条 聪少自媒体

#coding=utf-8

#今日头条

from lxml import etree

import requests

import urllib2,urllib

def get_url():
    """Fetch the Toutiao "news_hot" channel page and print each scraped row.

    Sends a GET request with browser-like headers, prints the HTTP status
    code, extracts the title / source / comment / time text nodes with XPath
    and prints one numbered record per matched row.

    Side effects:
        Increments the module-level ``count`` once per printed record. The
        caller must initialise ``count`` before calling this function (the
        script entry point sets it to 0).

    Errors raised by ``requests`` while fetching the page are printed to
    stdout and the function returns without scraping anything.
    """
    global count

    url = 'https://www.toutiao.com/ch/news_hot/'
    headers = {
        'Host': 'www.toutiao.com',
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; 125LA; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)',
        'Connection': 'Keep-Alive',
        'Content-Type': 'text/plain; Charset=UTF-8',
        'Accept': '*/*',
        'Accept-Language': 'zh-cn',
        'cookie': '__tasessionId=u690hhtp21501983729114;cp=59861769FA4FFE1',
    }

    # Only the network call can fail with a transport error, so keep the
    # try body minimal. requests raises its own exception hierarchy (never
    # urllib2.URLError), and a timeout keeps a stalled server from hanging us.
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(e)
        return

    print(response.status_code)

    tree = etree.HTML(response.content)
    # Every query ends in /text(), so each result is a list of strings,
    # not a list of elements.
    titles = tree.xpath('//a[@class="link title"]/text()')
    sources = tree.xpath('//a[@class="lbtn source"]/text()')
    comments = tree.xpath('//a[@class="lbtn comment"]/text()')
    times = tree.xpath('//span[@class="lbtn"]/text()')

    # NOTE(review): the original author annotated this count as 0 --
    # presumably the list is rendered client-side and is absent from the
    # raw HTML; confirm before relying on these XPath expressions.
    print(len(titles))
    print(type(titles))

    # zip() stops at the shortest list, so only complete rows are printed.
    for title, source, comment, stime in zip(titles, sources, comments, times):
        count += 1
        data = {
            # The values are already text; they have no ``.text`` attribute.
            'title': title,
            'source': source,
            'comment': comment,
            'stime': stime,
        }
        print('%s | %s' % (count, data))

if __name__ == '__main__':
    # get_url() increments this module-level counter through ``global count``,
    # so it has to exist before the call.
    count = 0
    get_url()

聪少爱学堂聪少
聪少爱学堂创始人,梅州市鹏鑫网络科技有限公司CEO,09年开始踏入互联网,10年互联网行业经验,资深自媒体人,自媒体优秀导师,咪挺微商团队营销引流顾问,业务包含:精准引流技术/代引流精准粉,专业小红书,知乎,微博代运营。
  • 38988文章总数
  • 1491133访问次数
  • 建站天数
  • 合作伙伴