抓取新闻列表搞定了
但是单独进入个别新闻页面时 抓取就被拒绝了
各种header都加了,还是被拒绝。
怎么办呢? CODE如下
import requests
import csv
from bs4 import BeautifulSoup
import urllib2
import urllib
import cookielib
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
headers["Accept"]="text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
headers["Referer"]="https://tw.news.yahoo.com/sports/"
headers["Accept-Encoding"]="gzip, deflate, sdch, br"
headers["Accept-Language"]="zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2"
headers["upgrade-insecure-requests"]="1"
payload1 = {}
urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
url="https://tw.news.yahoo.com/%E4%B8%8D%E9%87%8D%E5%BB%BA%E4%BA%86-%E5%85%89%E8%8A%92%E5%8F%AF%E8%83%BD%E6%8B%9B%E6%94%AC%E5%B7%B4%E6%8F%90%E6%96%AF%E5%A1%94-072000742.html"
res = requests.post(url, headers=headers, data=payload1, stream=True)
res.encoding='utf-8'
#print "res text = " + res.text
soup = BeautifulSoup(res.text, "html.parser")
print "url is " + url
item = soup.find('div')
print soup
print headers
得到的结果
url is https://tw.news.yahoo.com/%E4%B8%8D%E9%87%8D%E5%BB%BA%E4%BA%86-%E5%85%89%E8%8A%92%E5%8F%AF%E8%83%BD%E6%8B%9B%E6%94%AC%E5%B7%B4%E6%8F%90%E6%96%AF%E5%A1%94-072000742.html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>Access Denied</title>
</meta></head>
<body>
<h1>Access Denied</h1>
<!