#coding=utf-8
#今日头条
from lxml import etree
import requests
import urllib2,urllib
def get_url():
url ='https://www.toutiao.com/ch/news_hot/'
global count
try:
headers = {
'Host': 'www.toutiao.com',
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; 125LA; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)',
'Connection': 'Keep-Alive',
'Content-Type': 'text/plain; Charset=UTF-8',
'Accept': '*/*',
'Accept-Language': 'zh-cn',
'cookie':'__tasessionId=u690hhtp21501983729114;cp=59861769FA4FFE1'}
response =requests.get(url,headers= headers)
print response.status_code
html = response.content
#print html
tree= etree.HTML(html)
title =tree.xpath('//a[@class="link title"]/text()')
source =tree.xpath('//a[@class="lbtn source"]/text()')
comment =tree.xpath('//a[@class="lbtn comment"]/text()')
stime =tree.xpath('//span[@class="lbtn"]/text()')
print len(title) #0
print type(title) #
for x,y,z,q in zip(title,source,comment,stime):
count +=1
data = {
'title':x.text,
'source':y.text,
'comment':z.text,
'stime':q.text}
print count,'|',data
except urllib2.URLError, e:
print e.reason
if __name__=='__main__':
count = 0
get_url()