搜索引擎–基于Django/Scrapy/ElasticSearch的搜索引擎的实现

  • 主机环境:Ubuntu 13.04
  • Python版本:2.7.4
  • Django版本:1.5.4
  • Scrapy版本:0.18.2
  • ElasticSearch版本:0.90.5

闲来无聊,查看了搜索引擎相关的基本知识。经过搜集资料,了解了搜索引擎所需要的基本子系统:爬取子系统、索引服务子系统、Web请求和应答子系统。然后经过学习基本的开源框架文档,把它们集成为一个项目,并已PUSH到GitHub。

首先查看基于开源的Scrapy爬虫框架编写的一个爬虫,爬取校园网的内容(主要是免流量)

01 #!/usr/bin/env python
02 #-*- coding:utf-8 -*-
03 #from urlparse import urljoin
04 from scrapy.utils.url import urljoin_rfc
05 from scrapy.spider import BaseSpider
06 from scrapy.selector import HtmlXPathSelector
07 from scrapy.http import Request
08
09 from scrapy.exceptions import DropItem
10
11 from mymodules.items import Website
12
13 import urllib
14 import re
15
class Xidian_Spider(BaseSpider):
    """Spider that crawls the Xidian campus site and follows in-page links.

    For every downloaded page, ``parse`` yields one ``Website`` item
    (url, title, visible body text) and then one ``Request`` per link
    found on the page, with ``parse`` as the callback again.
    """

    name = "xidian_spider"
    start_urls = [
        "http://www.xidian.edu.cn",
        #"http://rs.xidian.edu.cn/forum.php",
    ]

    # Patterns are compiled once here instead of once per link.
    # Links ending in one of these extensions (optionally followed by an
    # "x", e.g. "docx") are skipped.
    _SKIP_SUFFIX = re.compile(
        r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
    # Links starting with one of these prefixes are skipped.
    _SKIP_PREFIX = re.compile(r'^((javascript:)|(openapi)).+')
    # Every text node under <body>, except text inside <script>/<style>,
    # so JavaScript and CSS source do not end up in the indexed content.
    _TEXT_XPATH = '/html/body//text()[not(ancestor::script or ancestor::style)]'

    def __init__(self, *args, **kwargs):
        """Run the base-class initialiser, then set ``allowed_domains``."""
        BaseSpider.__init__(self, *args, **kwargs)
        self.allowed_domains = ['xidian.edu.cn']

    def parse(self, response):
        """Yield the item for this page, then a Request for each link on it.

        :param response: the downloaded page.
        :returns: a generator producing one ``Website`` item followed by
            zero or more ``Request`` objects.
        """
        hxs = HtmlXPathSelector(response)

        item = Website()
        item['url'] = response.url

        # A page without <title> must not abort the whole parse.
        titles = hxs.select('/html/head/title/text()').extract()
        item['title'] = titles[0] if titles else u''

        # Join the non-empty, stripped text fragments with single spaces.
        fragments = hxs.select(self._TEXT_XPATH).extract()
        item['content'] = ' '.join(
            text for text in (frag.strip() for frag in fragments) if text)

        yield item

        for href in hxs.select('//@href').extract():
            href = href.strip()
            if not href:
                continue
            if self._SKIP_SUFFIX.match(href) or self._SKIP_PREFIX.match(href):
                continue

            # Resolve relative links ("a.html", "/a", "../a") against the
            # URL of the current page rather than against the host root.
            target = urljoin_rfc(response.url, href, response.encoding)

            # Only http(s) documents can be crawled; this drops mailto:,
            # ftp: and similar schemes.
            if not target.startswith(('http://', 'https://')):
                continue

            yield Request(target, callback=self.parse)

    def gethostname(self, res_url):
        """Return the host part of ``res_url``.

        The scheme is removed with ``urllib.splittype`` and the host is
        then taken from ``urllib.splithost``.
        """
        proto, rest = urllib.splittype(res_url)
        host, rest = urllib.splithost(rest)
        return host

爬取得到的ITEM会交给PIPELINE处理。

这里的PipeLine做了去重处理。由于已访问的URL数量很大,不能简单地全部放在内存中,所以使用的是Bloom Filter算法,这里直接安装了Python开源库中的pybloomfilter(有时间研究一下)

class DuplicatesPipeline(object):
    """Item pipeline that drops already-seen URLs and indexes new pages.

    Seen URLs are tracked in a Bloom filter. Each new item is appended to
    the ``visitedsites`` file and handed to ``SearchIndex`` for indexing.
    """

    def __init__(self):
        """Open the Bloom filter, the visited-sites log and the search index."""
        # Arguments: capacity, error rate, backing file.
        self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
        self.f_write = open('visitedsites', 'w')
        self.si = SearchIndex()
        self.si.SearchInit()
        self._closed = False

    def process_item(self, item, spider):
        """Drop ``item`` if its URL was seen before, otherwise log and index it.

        :returns: the unchanged item when it is new.
        :raises DropItem: when the URL is already in the Bloom filter.
        """
        print('************%d pages visited!*****************' % len(self.bf))
        if self.bf.add(item['url']):  # True if the URL was already in the filter
            raise DropItem("Duplicate item found: %s" % item)

        self.save_to_file(item['url'], item['title'])
        self.si.AddIndex(item)
        return item

    def save_to_file(self, url, utitle):
        """Append one ``url<TAB>title`` line to the visited-sites log."""
        # Collapse whitespace so a tab or newline inside the title cannot
        # break the one-record-per-line, tab-separated layout.
        title = u' '.join(utitle.split())
        self.f_write.write(url)
        self.f_write.write('\t')
        self.f_write.write(title.encode('utf-8'))
        self.f_write.write('\n')

    def close_spider(self, spider):
        """Scrapy pipeline hook: release resources when the spider closes."""
        self._cleanup()

    def _cleanup(self):
        """Close the log file and finish the index; safe to call repeatedly."""
        if getattr(self, '_closed', False):
            return
        self._closed = True
        # Attributes may be missing if __init__ failed part-way through.
        log_file = getattr(self, 'f_write', None)
        if log_file is not None:
            log_file.close()
        index = getattr(self, 'si', None)
        if index is not None:
            index.IndexDone()

    def __del__(self):
        """Fallback cleanup in case ``close_spider`` was never called."""
        self._cleanup()

该类中的SearchIndex是ElasticSearch建立索引的类。定义如下:

01 #!/usr/bin/env python
02 #-*- coding:utf-8-*-
03 import os
04 import sys
05 from pyes import *
06 from mymodules.items import Website
07 INDEX_NAME='xidian_spider'
08
class SearchIndex(object):
    """Builds the ``INDEX_NAME`` ElasticSearch index through a pyes connection.

    Usage order: ``SearchInit()`` once, ``AddIndex(item)`` per page,
    ``IndexDone()`` at the end.
    """

    @staticmethod
    def _field_mapping(analyzer=None):
        """Return the mapping for one stored, analyzed string field.

        :param analyzer: analyzer name used for both indexing and
            searching; when ``None`` no analyzer keys are set.
        """
        field = {
            'boost': 1.0,
            'index': 'analyzed',
            'store': 'yes',
            'type': u'string',
            'term_vector': 'with_positions_offsets',
        }
        if analyzer is not None:
            field['indexAnalyzer'] = analyzer
            field['searchAnalyzer'] = analyzer
        return field

    def SearchInit(self):
        """Connect to ES, recreate the index and register the type mapping."""
        self.conn = ES('127.0.0.1:9200', timeout=3.5)  # Connect to ES

        # Best-effort removal of an index left over from a previous run;
        # failure (e.g. the index does not exist yet) is deliberately
        # ignored. Catching Exception instead of using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        try:
            self.conn.delete_index(INDEX_NAME)
        except Exception:
            pass
        self.conn.create_index(INDEX_NAME)  # Create a new index

        # Define the structure of the data: content and title use the
        # "ik" analyzer, url is given no analyzer setting.
        mapping = {
            u'content': self._field_mapping('ik'),
            u'title': self._field_mapping('ik'),
            u'url': self._field_mapping(),
        }

        # Define the type
        self.conn.put_mapping("searchEngine-type", {'properties': mapping},
                              [INDEX_NAME])

    def AddIndex(self, item):
        """Index the title, url and content of ``item`` as UTF-8 strings."""
        print('Adding Index item URL %s' % item['url'].encode('utf-8'))
        document = {
            'title': item['title'].encode('utf-8'),
            'url': item['url'].encode('utf-8'),
            'content': item['content'].encode('utf-8'),
        }
        self.conn.index(document, INDEX_NAME, 'searchEngine-type')

    def IndexDone(self):
        """Set the default indices to ``INDEX_NAME`` and refresh ES."""
        self.conn.default_indices = [INDEX_NAME]  # Set the default indices
        self.conn.refresh()  # Refresh the ES

其中中文分词使用的是IK分词,Python库中直接安装即可。

Django中接受搜索请求的处理函数如下:

def search(request):
    """Handle a search request and render the result page.

    Reads the query from the ``q`` GET parameter and the 1-based page
    number from the optional ``page`` GET parameter.

    :returns: the rendered ``res_search.html`` page, or a plain
        ``HttpResponse`` with a message when no query was submitted.
    """
    import time

    q = request.GET.get('q', '').strip()
    if not q:
        return HttpResponse('You submitted an empty form.')

    # A missing, non-numeric or non-positive page falls back to page 1
    # instead of raising and producing a server error.
    try:
        page_no = int(request.GET.get('page', 1))
    except (TypeError, ValueError):
        page_no = 1
    if page_no < 1:
        page_no = 1
    page = unicode(page_no)

    # Wall-clock time: the query mostly waits on ElasticSearch, which a
    # CPU-time clock would not count.
    start = time.time()
    results = dosearch(q, page)  # connect to ES to return the results
    elapsed = time.time() - start

    context = {
        'results': results,
        'query': q,
        'count': len(results),
        'time': elapsed,
        'page': page,
        'nextpage': page_no + 1,
    }
    return render(request, 'res_search.html', context)

其中调用dosearch函数进行连接ES查询,函数内容如下:

def dosearch(string, upage):
    """Query ElasticSearch for ``string`` and return one page of results.

    The query matches ``string`` against the ``title`` and ``content``
    fields; either field matching is enough. Where ES returns a
    highlighted fragment for a field, that fragment replaces the stored
    field value in the result.

    :param string: the search text.
    :param upage: 1-based page number, as text or int; anything that is
        not a positive integer is treated as page 1.
    :returns: a list of ``Results`` objects with ``content``, ``title``
        and ``url`` set.
    """
    conn = ES('127.0.0.1:9200', timeout=3.5)  # Connect to ES

    fq_title = FieldQuery(analyzer='ik')
    fq_title.add('title', string)

    fq_content = FieldQuery(analyzer='ik')
    fq_content.add('content', string)

    bq = BoolQuery(should=[fq_title, fq_content])

    # Highlighted terms are wrapped in "[" and "]".
    h = HighLighter(['['], [']'], fragment_size=100)

    try:
        page = int(upage)
    except (TypeError, ValueError):
        page = 1
    if page < 1:
        page = 1

    s = Search(bq, highlight=h, start=(page - 1) * PAGE_SIZE, size=PAGE_SIZE)
    s.add_highlight("content")
    s.add_highlight('title')
    results = conn.search(s, indices='xidian_spider',
                          doc_types='searchEngine-type')

    found = []
    for hit in results:
        # NOTE(review): assumes a hit may arrive without highlight data;
        # fall back to the stored fields in that case.
        highlight = getattr(hit._meta, 'highlight', None) or {}

        res = Results()
        if 'content' in highlight:
            res.content = highlight[u'content'][0]
        else:
            res.content = hit['content']
        if 'title' in highlight:
            res.title = highlight[u'title'][0]
        else:
            res.title = hit['title']
        res.url = hit['url']
        found.append(res)
    return found

标签