本站源代码
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

438 lines
12KB

  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package searcher
  15. import (
  16. "fmt"
  17. "math"
  18. "reflect"
  19. "github.com/blevesearch/bleve/index"
  20. "github.com/blevesearch/bleve/search"
  21. "github.com/blevesearch/bleve/size"
  22. )
  23. var reflectStaticSizePhraseSearcher int
  24. func init() {
  25. var ps PhraseSearcher
  26. reflectStaticSizePhraseSearcher = int(reflect.TypeOf(ps).Size())
  27. }
  28. type PhraseSearcher struct {
  29. mustSearcher search.Searcher
  30. queryNorm float64
  31. currMust *search.DocumentMatch
  32. terms [][]string
  33. path phrasePath
  34. paths []phrasePath
  35. locations []search.Location
  36. initialized bool
  37. }
  38. func (s *PhraseSearcher) Size() int {
  39. sizeInBytes := reflectStaticSizePhraseSearcher + size.SizeOfPtr
  40. if s.mustSearcher != nil {
  41. sizeInBytes += s.mustSearcher.Size()
  42. }
  43. if s.currMust != nil {
  44. sizeInBytes += s.currMust.Size()
  45. }
  46. for _, entry := range s.terms {
  47. sizeInBytes += size.SizeOfSlice
  48. for _, entry1 := range entry {
  49. sizeInBytes += size.SizeOfString + len(entry1)
  50. }
  51. }
  52. return sizeInBytes
  53. }
  54. func NewPhraseSearcher(indexReader index.IndexReader, terms []string, field string, options search.SearcherOptions) (*PhraseSearcher, error) {
  55. // turn flat terms []string into [][]string
  56. mterms := make([][]string, len(terms))
  57. for i, term := range terms {
  58. mterms[i] = []string{term}
  59. }
  60. return NewMultiPhraseSearcher(indexReader, mterms, field, options)
  61. }
  62. func NewMultiPhraseSearcher(indexReader index.IndexReader, terms [][]string, field string, options search.SearcherOptions) (*PhraseSearcher, error) {
  63. options.IncludeTermVectors = true
  64. var termPositionSearchers []search.Searcher
  65. for _, termPos := range terms {
  66. if len(termPos) == 1 && termPos[0] != "" {
  67. // single term
  68. ts, err := NewTermSearcher(indexReader, termPos[0], field, 1.0, options)
  69. if err != nil {
  70. // close any searchers already opened
  71. for _, ts := range termPositionSearchers {
  72. _ = ts.Close()
  73. }
  74. return nil, fmt.Errorf("phrase searcher error building term searcher: %v", err)
  75. }
  76. termPositionSearchers = append(termPositionSearchers, ts)
  77. } else if len(termPos) > 1 {
  78. // multiple terms
  79. var termSearchers []search.Searcher
  80. for _, term := range termPos {
  81. if term == "" {
  82. continue
  83. }
  84. ts, err := NewTermSearcher(indexReader, term, field, 1.0, options)
  85. if err != nil {
  86. // close any searchers already opened
  87. for _, ts := range termPositionSearchers {
  88. _ = ts.Close()
  89. }
  90. return nil, fmt.Errorf("phrase searcher error building term searcher: %v", err)
  91. }
  92. termSearchers = append(termSearchers, ts)
  93. }
  94. disjunction, err := NewDisjunctionSearcher(indexReader, termSearchers, 1, options)
  95. if err != nil {
  96. // close any searchers already opened
  97. for _, ts := range termPositionSearchers {
  98. _ = ts.Close()
  99. }
  100. return nil, fmt.Errorf("phrase searcher error building term position disjunction searcher: %v", err)
  101. }
  102. termPositionSearchers = append(termPositionSearchers, disjunction)
  103. }
  104. }
  105. mustSearcher, err := NewConjunctionSearcher(indexReader, termPositionSearchers, options)
  106. if err != nil {
  107. // close any searchers already opened
  108. for _, ts := range termPositionSearchers {
  109. _ = ts.Close()
  110. }
  111. return nil, fmt.Errorf("phrase searcher error building conjunction searcher: %v", err)
  112. }
  113. // build our searcher
  114. rv := PhraseSearcher{
  115. mustSearcher: mustSearcher,
  116. terms: terms,
  117. }
  118. rv.computeQueryNorm()
  119. return &rv, nil
  120. }
  121. func (s *PhraseSearcher) computeQueryNorm() {
  122. // first calculate sum of squared weights
  123. sumOfSquaredWeights := 0.0
  124. if s.mustSearcher != nil {
  125. sumOfSquaredWeights += s.mustSearcher.Weight()
  126. }
  127. // now compute query norm from this
  128. s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
  129. // finally tell all the downstream searchers the norm
  130. if s.mustSearcher != nil {
  131. s.mustSearcher.SetQueryNorm(s.queryNorm)
  132. }
  133. }
  134. func (s *PhraseSearcher) initSearchers(ctx *search.SearchContext) error {
  135. err := s.advanceNextMust(ctx)
  136. if err != nil {
  137. return err
  138. }
  139. s.initialized = true
  140. return nil
  141. }
  142. func (s *PhraseSearcher) advanceNextMust(ctx *search.SearchContext) error {
  143. var err error
  144. if s.mustSearcher != nil {
  145. if s.currMust != nil {
  146. ctx.DocumentMatchPool.Put(s.currMust)
  147. }
  148. s.currMust, err = s.mustSearcher.Next(ctx)
  149. if err != nil {
  150. return err
  151. }
  152. }
  153. return nil
  154. }
  155. func (s *PhraseSearcher) Weight() float64 {
  156. return s.mustSearcher.Weight()
  157. }
  158. func (s *PhraseSearcher) SetQueryNorm(qnorm float64) {
  159. s.mustSearcher.SetQueryNorm(qnorm)
  160. }
  161. func (s *PhraseSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) {
  162. if !s.initialized {
  163. err := s.initSearchers(ctx)
  164. if err != nil {
  165. return nil, err
  166. }
  167. }
  168. for s.currMust != nil {
  169. // check this match against phrase constraints
  170. rv := s.checkCurrMustMatch(ctx)
  171. // prepare for next iteration (either loop or subsequent call to Next())
  172. err := s.advanceNextMust(ctx)
  173. if err != nil {
  174. return nil, err
  175. }
  176. // if match satisfied phrase constraints return it as a hit
  177. if rv != nil {
  178. return rv, nil
  179. }
  180. }
  181. return nil, nil
  182. }
  183. // checkCurrMustMatch is solely concerned with determining if the DocumentMatch
  184. // pointed to by s.currMust (which satisifies the pre-condition searcher)
  185. // also satisfies the phase constraints. if so, it returns a DocumentMatch
  186. // for this document, otherwise nil
  187. func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.DocumentMatch {
  188. s.locations = s.currMust.Complete(s.locations)
  189. locations := s.currMust.Locations
  190. s.currMust.Locations = nil
  191. ftls := s.currMust.FieldTermLocations
  192. // typically we would expect there to only actually be results in
  193. // one field, but we allow for this to not be the case
  194. // but, we note that phrase constraints can only be satisfied within
  195. // a single field, so we can check them each independently
  196. for field, tlm := range locations {
  197. ftls = s.checkCurrMustMatchField(ctx, field, tlm, ftls)
  198. }
  199. if len(ftls) > 0 {
  200. // return match
  201. rv := s.currMust
  202. s.currMust = nil
  203. rv.FieldTermLocations = ftls
  204. return rv
  205. }
  206. return nil
  207. }
  208. // checkCurrMustMatchField is solely concerned with determining if one
  209. // particular field within the currMust DocumentMatch Locations
  210. // satisfies the phase constraints (possibly more than once). if so,
  211. // the matching field term locations are appended to the provided
  212. // slice
  213. func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext,
  214. field string, tlm search.TermLocationMap,
  215. ftls []search.FieldTermLocation) []search.FieldTermLocation {
  216. if s.path == nil {
  217. s.path = make(phrasePath, 0, len(s.terms))
  218. }
  219. s.paths = findPhrasePaths(0, nil, s.terms, tlm, s.path[:0], 0, s.paths[:0])
  220. for _, p := range s.paths {
  221. for _, pp := range p {
  222. ftls = append(ftls, search.FieldTermLocation{
  223. Field: field,
  224. Term: pp.term,
  225. Location: search.Location{
  226. Pos: pp.loc.Pos,
  227. Start: pp.loc.Start,
  228. End: pp.loc.End,
  229. ArrayPositions: pp.loc.ArrayPositions,
  230. },
  231. })
  232. }
  233. }
  234. return ftls
  235. }
  236. type phrasePart struct {
  237. term string
  238. loc *search.Location
  239. }
  240. func (p *phrasePart) String() string {
  241. return fmt.Sprintf("[%s %v]", p.term, p.loc)
  242. }
  243. type phrasePath []phrasePart
  244. func (p phrasePath) MergeInto(in search.TermLocationMap) {
  245. for _, pp := range p {
  246. in[pp.term] = append(in[pp.term], pp.loc)
  247. }
  248. }
  249. func (p phrasePath) String() string {
  250. rv := "["
  251. for i, pp := range p {
  252. if i > 0 {
  253. rv += ", "
  254. }
  255. rv += pp.String()
  256. }
  257. rv += "]"
  258. return rv
  259. }
  260. // findPhrasePaths is a function to identify phase matches from a set
  261. // of known term locations. it recursive so care must be taken with
  262. // arguments and return values.
  263. //
  264. // prevPos - the previous location, 0 on first invocation
  265. // ap - array positions of the first candidate phrase part to
  266. // which further recursive phrase parts must match,
  267. // nil on initial invocation or when there are no array positions
  268. // phraseTerms - slice containing the phrase terms,
  269. // may contain empty string as placeholder (don't care)
  270. // tlm - the Term Location Map containing all relevant term locations
  271. // p - the current path being explored (appended to in recursive calls)
  272. // this is the primary state being built during the traversal
  273. // remainingSlop - amount of sloppiness that's allowed, which is the
  274. // sum of the editDistances from each matching phrase part,
  275. // where 0 means no sloppiness allowed (all editDistances must be 0),
  276. // decremented during recursion
  277. // rv - the final result being appended to by all the recursive calls
  278. //
  279. // returns slice of paths, or nil if invocation did not find any successul paths
  280. func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string,
  281. tlm search.TermLocationMap, p phrasePath, remainingSlop int, rv []phrasePath) []phrasePath {
  282. // no more terms
  283. if len(phraseTerms) < 1 {
  284. // snapshot or copy the recursively built phrasePath p and
  285. // append it to the rv, also optimizing by checking if next
  286. // phrasePath item in the rv (which we're about to overwrite)
  287. // is available for reuse
  288. var pcopy phrasePath
  289. if len(rv) < cap(rv) {
  290. pcopy = rv[:len(rv)+1][len(rv)][:0]
  291. }
  292. return append(rv, append(pcopy, p...))
  293. }
  294. car := phraseTerms[0]
  295. cdr := phraseTerms[1:]
  296. // empty term is treated as match (continue)
  297. if len(car) == 0 || (len(car) == 1 && car[0] == "") {
  298. nextPos := prevPos + 1
  299. if prevPos == 0 {
  300. // if prevPos was 0, don't set it to 1 (as thats not a real abs pos)
  301. nextPos = 0 // don't advance nextPos if prevPos was 0
  302. }
  303. return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop, rv)
  304. }
  305. // locations for this term
  306. for _, carTerm := range car {
  307. locations := tlm[carTerm]
  308. LOCATIONS_LOOP:
  309. for _, loc := range locations {
  310. if prevPos != 0 && !loc.ArrayPositions.Equals(ap) {
  311. // if the array positions are wrong, can't match, try next location
  312. continue
  313. }
  314. // compute distance from previous phrase term
  315. dist := 0
  316. if prevPos != 0 {
  317. dist = editDistance(prevPos+1, loc.Pos)
  318. }
  319. // if enough slop remaining, continue recursively
  320. if prevPos == 0 || (remainingSlop-dist) >= 0 {
  321. // skip if we've already used this term+loc already
  322. for _, ppart := range p {
  323. if ppart.term == carTerm && ppart.loc == loc {
  324. continue LOCATIONS_LOOP
  325. }
  326. }
  327. // this location works, add it to the path (but not for empty term)
  328. px := append(p, phrasePart{term: carTerm, loc: loc})
  329. rv = findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist, rv)
  330. }
  331. }
  332. }
  333. return rv
  334. }
  335. func editDistance(p1, p2 uint64) int {
  336. dist := int(p1 - p2)
  337. if dist < 0 {
  338. return -dist
  339. }
  340. return dist
  341. }
  342. func (s *PhraseSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) {
  343. if !s.initialized {
  344. err := s.initSearchers(ctx)
  345. if err != nil {
  346. return nil, err
  347. }
  348. }
  349. if s.currMust != nil {
  350. if s.currMust.IndexInternalID.Compare(ID) >= 0 {
  351. return s.Next(ctx)
  352. }
  353. ctx.DocumentMatchPool.Put(s.currMust)
  354. }
  355. if s.currMust == nil {
  356. return nil, nil
  357. }
  358. var err error
  359. s.currMust, err = s.mustSearcher.Advance(ctx, ID)
  360. if err != nil {
  361. return nil, err
  362. }
  363. return s.Next(ctx)
  364. }
  365. func (s *PhraseSearcher) Count() uint64 {
  366. // for now return a worst case
  367. return s.mustSearcher.Count()
  368. }
  369. func (s *PhraseSearcher) Close() error {
  370. if s.mustSearcher != nil {
  371. err := s.mustSearcher.Close()
  372. if err != nil {
  373. return err
  374. }
  375. }
  376. return nil
  377. }
  378. func (s *PhraseSearcher) Min() int {
  379. return 0
  380. }
  381. func (s *PhraseSearcher) DocumentMatchPoolSize() int {
  382. return s.mustSearcher.DocumentMatchPoolSize() + 1
  383. }
上海开阖软件有限公司 沪ICP备12045867号-1