# Парсим wordstat без капчи (parsing Yandex Wordstat without a captcha)
#coding:utf-8
from grab import Grab
import threading,Queue,re,time,random
from urllib import quote_plus
lock = threading.RLock()
queue = Queue.Queue()
class WordStat(object):
def __init__(self):
    """Set up crawl thresholds, the seen-keyword list and the request cookies.

    Also truncates (or creates) ``parsing.txt`` in the current working
    directory so that each run starts with an empty file.
    """
    # parse() only queues a keyword when its count is greater than this.
    self.min = 30
    # Upper threshold; NOTE(review): the line that uses it is damaged in
    # the visible source, so its exact role should be confirmed.
    self.max = 3000
    # Keywords that have already been queued; parse() checks membership
    # here before queueing and appends to it, so it must stay a list.
    self.used = []
    # Cookies handed to every Grab request made by parse().
    self.cook = self.magic_cookie()
    # Opening in "w" mode empties the file; close the handle right away
    # instead of leaving it open until garbage collection.
    open("parsing.txt", "w").close()
def write(self, data, name, mode='a+'):
    """Write ``data`` to the file ``<name>.txt`` while holding the module lock.

    Args:
        data: Text to write.
        name: File path without the ``.txt`` suffix, which is appended here.
        mode: Mode passed to ``open()``; defaults to ``'a+'`` (append).

    Raises:
        IOError/OSError: Propagated from ``open()`` or ``write()``.
    """
    # Context managers guarantee the lock is released and the file is
    # closed even when open()/write() raises; otherwise a single failed
    # write would leave the lock held and block every other thread.
    with lock:
        with open("%s.txt" % (name), mode) as out:
            # Closing the file on exit flushes it, so no explicit flush().
            out.write(data)
def parse(self,key):
page = 1
query_safe = quote_plus(key.encode('utf-8'))
g = Grab()
for i in xrange(3):
try:
url = 'http://wordstat.yandex.ru/?cmd=words&page=%d&text=%s&geo=&text_geo=' % (page, query_safe)
g.setup(url=url,cookies=self.cook,timeout=150,)
g.request()
for tr in g.xpath('//tr[@class="tlist"]'):
items = [x.strip().encode('utf-8') for x in tr.xpath('td//text()')]
kkey = self.clear(items[1])
count = items[-1]
if (kkey not in self.used) and (int(count)>self.min):
queue.put(kkey)
self.used.append(kkey)
if (int(count)0:
self.parse(queue.get_nowait())
# Script entry point: build the parser (this runs __init__, which fetches the
# cookies and empties parsing.txt) and start it through go().
# NOTE(review): go() is not defined in the visible part of the file.
w = WordStat()
w.go()