I wrote a Python Scrapy spider to do this. Here is the code:
import scrapy
class QuotesSpider(scrapy.Spider):
    """Request one Sogou WeChat search page and save the response to disk."""

    name = "quotes"

    def start_requests(self):
        """Yield a single GET request that carries the search parameters."""
        # Key order and spelling are kept exactly as the request was written.
        # NOTE(review): 'interation' may be a misspelling -- confirm against
        # the parameter name the site actually expects before changing it.
        search_params = {
            'type': '2',
            'ie': 'utf8',
            'query': "the shape of water",
            'tsn': '1',
            'ft': '',
            'et': '',
            'interation': '',
            'wxid': '',
            'usip': '',
        }
        request = scrapy.FormRequest(
            url='http://weixin.sogou.com/weixin',
            formdata=search_params,
            method='get',
            callback=self.parse,
        )
        yield request

    def parse(self, response):
        """Write the raw response body to ``quotes.html`` and log the name."""
        target = "quotes.html"
        # Binary mode: response.body is written as-is, without decoding.
        with open(target, "wb") as out:
            out.write(response.body)
        self.log("Saved file %s" % target)
When I run the code, the request is always redirected to the homepage, weixin.sogou.com.
Then I tried many approaches. Eventually, someone suggested sending http://weixin.sogou.com/weixin?type=2 as the Referer in the request headers. I tried it, and it works: the request is no longer redirected.
import scrapy
import time
from bs4 import BeautifulSoup
class QuotesSpider(scrapy.Spider):
    """Sogou WeChat search spider that sends explicit request headers.

    ``Cookie``, ``UA`` and ``query`` are not defined in this snippet;
    presumably they are module-level values supplied elsewhere -- confirm
    before running.
    """

    name = "quotes"

    # Headers attached to the search request, including the Referer.
    headers = {
        'Cookie': Cookie,
        'User-Agent': UA,
        'Referer': "http://weixin.sogou.com/weixin?type=2",
    }

    def start_requests(self, filename=None):
        """Yield the search request with the class-level headers attached.

        ``filename`` is accepted but not used by this method.
        The callback is ``self.parse``, which this snippet does not define.
        """
        search_params = {
            'type': '2',
            'ie': 'utf8',
            'query': query,
            'tsn': '1',
            'ft': '',
            'et': '',
            # Parameters the author left disabled:
            # 'sst0': str(int(time.time()*1000)),
            # 'page': str(1),
            'interation': '',
            'wxid': '',
            'usip': '',
        }
        # Ask Scrapy not to follow redirects and to hand 301/302/303
        # responses to the callback instead.
        request_meta = {
            'dont_redirect': True,
            'handle_httpstatus_list': [301, 302, 303],
        }
        yield scrapy.http.FormRequest(
            url='http://weixin.sogou.com/weixin',
            formdata=search_params,
            headers=self.headers,
            method='get',
            dont_filter=True,
            meta=request_meta,
            callback=self.parse,
        )
No comments:
Post a Comment