完成的目标:
输入搜索的商品 以及 淘宝的已评价数目、店铺的商品描述(包括如实描述、服务态度、快递的5.0打分);
按要求,筛选出要求数量的结果,并按“物美价廉算法”排序后输出
思路:
1,利用淘宝搜索'https://s.taobao.com/search?'的价格filter 先进行价格筛选,得到结果的网站
2,用urllib打开结果网站,构造正则表达式匹配出各个商品结果的 价格、已评价数量、店铺的如实描述等信息;
并把结果保存至二维数组里。
3,利用商品及店铺信息,用“物美价廉算法”给各个商品打分
4,按打分排序, 各个信息总结果按排序输出到新建txt文件里;
并将各个商品图片下载到文件及建立相同排序开头的txt(其名字包含简要的商品信息),这样图片和商品信息同时能在一个文件夹里用大图排列看到。
5,可以把输入的参数(价格范围等要求)以函数输入,用pyinstaller 把整个py程序打包为EXE 就可以发布了。
如要求条件为:
reserch_goods='ssd120g' #淘宝搜索词
keyword='.'#raw_input().decode("gbk").encode("utf-8") #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制
price_min=22 #价格区间
price_max=400
descripHrequ=0 # % 默认高于average, 输出结果大于此值
servHrequ=0 # % 默认高于average, 输出结果大于此值
descripNrequ=6
counts=10 #要求选出多少个商品
结果显示在results文件里


源代码如下:
# -*- coding: utf-8 -*- import urllib import urllib2 import re import time import random import os from math import log from math import log10 from math import sqrt import sys reload(sys) sys.setdefaultencoding('utf8') class counter(object): #计数器 def __init__(self): self.count = 0 self.try_time = 0 self.fail_time = 0 self.url_list = [] self.new_flag = True self.results=[] self.p=0 self.d=0 def print_counter(self): print 'try_time:', self.try_time, " get_count:" , self.count, " fail_time:",self.fail_time counter1 = counter() def post_request(url): #使用代理 proxy = {'http':'27.24.158.155:84'} proxy_support = urllib2.ProxyHandler(proxy) # opener = urllib2.build_opener(proxy_support,urllib2.HTTPHandler(debuglevel=1)) opener = urllib2.build_opener(proxy_support) urllib2.install_opener(opener) #构造随机头部文件访问请求 User_Agents=["Mozilla/5.0 (windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36", "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", # "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11O" ] random_User_Agent = random.choice(User_Agents) #print random_User_Agent req =urllib2.Request(url) #!! 
req.add_header("User-Agent",random_User_Agent) req.add_header("GET",url) req.add_header("Referer",url) return req def recommend_rate(price,description,delivery,service,comments): #描述为绝对值 av_p=counter1.p/counter1.count av_d=counter1.d/counter1.count rate=(description/av_d)**20*(description+delivery+service)*(av_p/(price))**0.1+log((comments+5),1000) print 'all count=',counter1.count print "avrage price=",av_p,';',av_p/(price),';price',price,';comments=',comments,';descrip=',description print 'rate=',rate,'(price)yinzi',(av_p/(price))**0.1,'descrip_yinzi',(description/av_d)**20,'comments_factor=',log((comments+50),100) return rate def product_rank(list): for x in list: #0开始为 x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况3个、x8服务情况 rate=recommend_rate(x[3],x[7],x[6],x[8],x[4]) x.append(rate) def get_user_rate(item_url): #暂时未使用该功能 '''获取卖家信用情况;未登录情况不能访问,或者需要在头部文件中加入cookie。。。;''' html=urllib2.urlopen(item_url) #"//rate.taobao.com/user-rate-282f910f3b70f2128abd0ee9170e6428.htm" regrex_rate='"(//.*?user\-rate.*?)"' codes= re.findall(regrex_rate,html.read()) html.close() user_rate_url= 'http:'+codes[0] print 'uu', user_rate_url user_rate_html = urllib2.urlopen(user_rate_url) print user_rate_html.read() #title="4.78589分" desc_regex=u'title="(4.[0-9]{5}).*?' 
de_pat=re.compile(desc_regex) descs = re.findall(de_pat,user_rate_html.read()) print len(descs) item_url='https://item.taobao.com/item.htm?id=530635294653&ns=1&abbucket=0#detail' #get_user_rate(item_url) '''获取卖家信用情况;未登录情况不能访问。。。暂时 无用''' def get_praised_good(url,file_open,keyword,counts,descripHrequ,servHrequ,descripNrequ): #从给定的淘宝链接中 获取符合条件的商品list html=urllib2.urlopen(post_request(url)) code=html.read() html.close() regrex2=ur'raw_title":"(.*?)","pic_url":"(.*?)","detail_url":"(.*?)","view_price":"(.*?)".*?"comment_count":"(.*?)".*?"nick":"(.*?)".*?"delivery":\[(.*?),(.*?),(.*?)\],"description":\[(.*?),(.*?),(.*?)\],"service":\[(.*?),(.*?),(.*?)\]' #每一个匹配项 返回 15个 字符串 #x[0]开始为 x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况3个、x9描述相符情况3个、x12服务情况3个 pat=re.compile(regrex2) meet_code=re.findall(regrex2,code)# for x in meet_code: if counter1.count>=counts : print "have get enough pruducts" break description_higher=int(x[10])*float(x[11])/100 service_higher=int(x[13])*float(x[14])/100 try: x4=int(x[4])#description_count except: x4=0 if (description_higher>=descripHrequ) and (service_higher>=servHrequ) and x4>=descripNrequ: if re.findall(keyword,x[0]) :# 中文keyword在结果中匹配问题暂时没有解决,,直接加在搜索词里吧 x0=x[0].replace(' ','').replace('/','') detail_url='http:'+x[2].decode('unicode-escape').encode('utf-8') x1='http:'+x[1].decode('unicode-escape').encode('utf-8') #print type(x) if detail_url in counter1.url_list: counter1.new_flag=False print 'no more new met products' print counter1.url_list print detail_url break counter1.url_list.append(detail_url) counter1.try_time+=1 counter1.count+=1 x11=float(x[11])/100 x9=float(x[9])/100 x12=float(x[12])/100 x6=float(x[6])/100 x3=float(x[3]) counter1.p+=x3 counter1.d+=x9 x5=unicode(x[5],'utf-8') result_list=[] result_list.append(x0) result_list.append(x1) result_list.append(detail_url) result_list.append(x3) result_list.append(x4) result_list.append(x5) result_list.append(x6) result_list.append(x9) result_list.append(x12) #0开始为 x0商品名 
、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况、x8服务情况 counter1.results.append(result_list) def save_downpic(lis,file_open,savePath): '''从商品list下载图片到reserve_file_path,并写入信息至fileopen''' #0开始为 x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况、x8服务情况、x9:rate len_list=len(lis) print len_list cc=0 for x in lis: if True: urllib.urlretrieve(x[1],savePath+'\\%s___'%cc +unicode(x[0],'utf-8')+'.jpg') txt_name=savePath+'\\'+ '%s__'%cc+ 'custome_description_%s __'%x[7] +'__comments_%s_'%x[4]+ '___price_%srmb___'%x[3] +x[5] +'.txt' file_o=open(txt_name,'a') file_o.write(x[2]) file_o.close() print '\nget_one_possible_fine_goods:\n','good_name:',x[0].decode('utf-8') print 'rate=',x[9] print 'price:',x[3],x[5]#.decode('utf-8') print 'custome_description:',x[7],'--','described_number:',x[4],' service:',x[8] print x[2].decode('utf-8'),'\ngood_pic_url:',x[1].decode('utf-8') print txt_name print cc+1,"th" file_open.write(u'%s__'%cc +u'%s'%x[0]+'\nprice:'+str(x[3])+'¥,'+'\n'+str(x[2])+' \n'+str(x[5])+'\ncustomer_description:'+str(x[7])+'described_number:'+str(x[4])+'\n\n\n') print 'get one -^-' # except : # print "failed to down picture or creat txt" # counter1.fail_time += 1 cc+=1 time.sleep(0.5) def get_all_praised_goods(reserch_goods,counts,savePath,price_min,price_max,descripHrequ,servHrequ ,descripNrequ): #边里搜索结果每一页 #initial url and page number initial_url='https://s.taobao.com/search?q='+reserch_goods+'&filter=reserve_price%5B'+'%s'%price_min+'%2C'+'%s'%price_max+'%5D&s=' print "initial_url",initial_url page_n=0 reserve_file=savePath+r'\found_goods.txt' file_open=open(reserve_file,'a') file_open.write('****************************\n') file_open.write(time.ctime()) file_open.write('\n****************************\n') while counter1.new_flag and counter1.count<counts : url_1=initial_url+'%s'%(44*page_n) #print initial_url print 'url_1:', url_1 #print 'ss',initial_url+'%s'%(44*page_n) page_n += 1 get_praised_good(url_1,file_open,keyword,counts,descripHrequ,servHrequ 
,descripNrequ) time.sleep(2) # except: print "%s"%page_n,"pages have been searched" if page_n >=11 : print "check keyword,maybe too restrict" break print url_1 product_rank(counter1.results) counter1.results.sort(key=lambda x :x[9],reverse=True) save_downpic(counter1.results,file_open,savePath) # for a in counter1.results: for b in a : file_open.write(unicode(str(b),'utf-8')) file_open.write('\t') file_open.write('\n\n') file_open.close() counter1.print_counter() def input_para_inner(): reserch_goods='ssd120g' #淘宝搜索词 keyword='.'#raw_input().decode("gbk").encode("utf-8") #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制 price_min=22 #价格区间 price_max=400 descripHrequ=0 # % 默认高于average, 输出结果大于此值 servHrequ=0 # % 默认高于average, 输出结果大于此值 descripNrequ=6 counts=10 #要求选出多少个商品 #savePath=r"C:\Users\Administrator\Desktop\python scrapy\find_worthy_goods\results"#结果保存路径 savePath=r"results"#结果保存路径 while os.path.exists(savePath): savePath =savePath+'%s'%random.randrange(0,100) if not os.path.exists(savePath): os.makedirs(savePath) get_all_praised_goods(reserch_goods,counts,savePath,price_min,price_max ,descripHrequ,servHrequ,descripNrequ) def input_para_byinterface(): print '说明:\n本程序用于在淘宝上搜索商品时主动通过 价格范围、商品描述、服务态度、评论数来筛选商品;\n筛选出来的商品图片下载保存到磁盘(默认桌面新建find_worty_goods文件夹)并建立同序号开头的txt文件,图片显示商品,其旁的txt文件名显示价格等关键信息,txt里保存商品的淘宝链接'.decode('utf-8') print "please input reserch _goods_name" #print "请输入搜索商品名称;注意不能有空格,下同".decode('utf-8') reserch_goods=raw_input().replace(' ','') #淘宝搜索词 ,并去除中间意外输入的空格 if reserch_goods: # # # print "please input _keyword that goods name must include:\n(more than one keyword must use Regular Expression); default by no kewords" # try: # keyword=raw_input().decode("gbk").encode("utf-8") #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制 # except: # keyword='青' # # keyword='.' 
print "\nplease input _minimal price and _maximal price; \ndefault by 0,10000\nnext by 'enter'key input nothing means by default,the same below " print '请输入价格范围 ;默认0-10000 ;两项用半角逗号","分隔 按回车键确认;什么也不输入代表使用默认值 '.decode('utf-8') try: price_min, price_max=input() except: print 'not input or wrong number,use default range' price_min, price_max = 0 ,10000 # print "\nplease input _description_higher_percent_require and _service_higher__percent_require\n range:(-100,100) ; \ndefault by 0,0 I.e better than average" print '请输入商品描述、服务高于平均值的百分比-100 ~100'.decode('utf-8') # % 默认高于average, 输出结果大于此值 try: descripHrequ,servHrequ=input() except: print 'not input or wrong number,use default range' descripHrequ = 0 # % 默认高于average, 输出结果大于此值 servHrequ = 0 # print "\nplease input description count limit, default more than 1" print '输入最低商品评价数,默认大于1'.decode('utf-8') try: descripNrequ=input() except : print 'not input or wrong number,use default range' descripNrequ=1 # print "\nIF customise file reserve path, Y or N \ndefault/sample as: C:\\Users\\Administrator\\Desktop\\find_worthy_goods\\results " print '是否自定义保存文件目录 Y or N'.decode('utf-8') if raw_input()=='Y': print "please input path that you want to reserve; \n " savePath = raw_input() else: savePath=r"C:\Users\Administrator\Desktop\find_worthy_goods\results"#结果保存路径 while os.path.exists(savePath): savePath = savePath+'%s'%random.randrange(1,10) #print "the path exist,we'll make a new one" try: os.makedirs(savePath) print 'ok,file_path we reserve results: %s'%savePath print '保存的路径为:'.decode('utf-8') except: print "failed to make file path\nplease restart program" print '创建文件夹失败,请重新启动程序'.decode('utf-8') # print "\nplease input how many results you want, default by 50" print '您要获取的商品数目,默认50'.decode('utf-8') try: counts=input() except : counts=50 # get_all_praised_goods(reserch_goods,counts,savePath,price_min,price_max,descripHrequ,servHrequ,decrip_N_req) print '\n' counter1.print_counter() print "finished,please look up in %s"%savePath 
print '下载完成'.decode('utf-8') print counter1.results input() else: print "no search goods" print '没有输入商品名称'.decode('utf-8') if __name__=="__main__": print '是否自定义' x= 1 keyword ='' if x==1: input_para_inner() else: input_para_byinterface() #保存图片,以文件名为商品图片名字,并以序号开头 #同时,输出 价格、商家名,商品描述、服务等 到 txt文本 #在商品图片看中后,便可按序号查找 #按描述、服务评价高于平均,购物体验应该可以的 View Code预计可添加功能:
交互界面
mysql的数据存储,以实现价格变动的比较