本站源代码
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

362 lines
9.8KB

  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package index
  15. import (
  16. "bytes"
  17. "encoding/json"
  18. "fmt"
  19. "reflect"
  20. "github.com/blevesearch/bleve/document"
  21. "github.com/blevesearch/bleve/index/store"
  22. "github.com/blevesearch/bleve/size"
  23. )
// reflectStaticSizeTermFieldDoc caches reflect.TypeOf(TermFieldDoc{}).Size(),
// i.e. the fixed size in bytes of the TermFieldDoc struct itself. It is
// assigned once in init and read by (*TermFieldDoc).Size.
var reflectStaticSizeTermFieldDoc int

// reflectStaticSizeTermFieldVector caches the fixed size in bytes of the
// TermFieldVector struct itself. It is assigned once in init and read by
// (*TermFieldVector).Size.
var reflectStaticSizeTermFieldVector int
  26. func init() {
  27. var tfd TermFieldDoc
  28. reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size())
  29. var tfv TermFieldVector
  30. reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size())
  31. }
// ErrorUnknownStorageType is a package-level error value whose message is
// "unknown storage type".
// NOTE(review): its users are outside this section of the file; presumably
// it is returned when a requested storage backend is not recognized --
// confirm against callers.
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
// Index is the interface an index implementation exposes. Its methods fall
// into these groups: lifecycle (Open, Close), document mutation (Update,
// Delete, Batch), internal key/value entries (SetInternal, DeleteInternal),
// read access (Reader), statistics (Stats, StatsMap), analysis (Analyze)
// and access to a store.KVStore (Advanced).
//
// NOTE(review): the precise semantics of each method are defined by the
// implementations, which are not visible in this file.
type Index interface {
	Open() error
	Close() error

	// Update, Delete and Batch mutate the documents held by the index;
	// Batch takes a *Batch, which can carry several operations at once.
	Update(doc *document.Document) error
	Delete(id string) error
	Batch(batch *Batch) error

	// SetInternal and DeleteInternal operate on raw key/value entries that
	// are addressed by a byte-slice key rather than by document.
	SetInternal(key, val []byte) error
	DeleteInternal(key []byte) error

	// Reader returns a low-level accessor on the index data. Close it to
	// release associated resources.
	Reader() (IndexReader, error)

	// Stats and StatsMap expose index statistics as a json.Marshaler and
	// as a generic map, respectively.
	Stats() json.Marshaler
	StatsMap() map[string]interface{}

	Analyze(d *document.Document) *AnalysisResult

	// Advanced returns a store.KVStore.
	// NOTE(review): presumably the store backing this index -- confirm.
	Advanced() (store.KVStore, error)
}
// DocumentFieldTermVisitor is a callback that receives a field name and a
// term as raw bytes. It is the visitor type accepted by
// IndexReader.DocumentVisitFieldTerms and DocValueReader.VisitDocValues.
type DocumentFieldTermVisitor func(field string, term []byte)
// IndexReader is the read-side interface returned by Index.Reader. It
// provides term, document-id and field-dictionary iterators, document and
// internal-value lookups, conversions between external and internal
// document identifiers, and dump helpers. Call Close when finished with it.
type IndexReader interface {
	// TermFieldReader returns a TermFieldReader for the given term in the
	// given field. The include* flags select which optional data
	// (frequency, norm, term vectors) is requested.
	TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (TermFieldReader, error)

	// DocIDReaderAll returns an iterator over all doc ids.
	// The caller must close the returned instance to release associated
	// resources.
	DocIDReaderAll() (DocIDReader, error)

	// DocIDReaderOnly returns a DocIDReader built from the supplied ids.
	// NOTE(review): presumably it iterates only over those ids -- confirm.
	DocIDReaderOnly(ids []string) (DocIDReader, error)

	FieldDict(field string) (FieldDict, error)

	// FieldDictRange is currently defined to include the start and end terms
	FieldDictRange(field string, startTerm []byte, endTerm []byte) (FieldDict, error)
	FieldDictPrefix(field string, termPrefix []byte) (FieldDict, error)

	Document(id string) (*document.Document, error)
	DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error

	DocValueReader(fields []string) (DocValueReader, error)

	Fields() ([]string, error)

	GetInternal(key []byte) ([]byte, error)

	DocCount() (uint64, error)

	// ExternalID and InternalID convert between the string document id and
	// the opaque IndexInternalID.
	ExternalID(id IndexInternalID) (string, error)
	InternalID(id string) (IndexInternalID, error)

	// The Dump* methods return channels of untyped values.
	// NOTE(review): element types and channel-closing behavior are defined
	// by implementations not visible here.
	DumpAll() chan interface{}
	DumpDoc(id string) chan interface{}
	DumpFields() chan interface{}

	Close() error
}
// The Regexp interface defines the subset of the regexp.Regexp API
// methods that are used by bleve indexes, allowing callers to pass in
// alternate implementations.
//
// The three method signatures match the identically named methods of the
// standard library's *regexp.Regexp.
type Regexp interface {
	FindStringIndex(s string) (loc []int)
	LiteralPrefix() (prefix string, complete bool)
	String() string
}
// IndexReaderRegexp is implemented by readers that can build a FieldDict
// for a field from a regular expression given as a string.
// NOTE(review): presumably an optional capability discovered via type
// assertion on an IndexReader -- confirm against callers.
type IndexReaderRegexp interface {
	FieldDictRegexp(field string, regex string) (FieldDict, error)
}
// IndexReaderFuzzy is implemented by readers that can build a FieldDict
// for a field from a term, a fuzziness value and a prefix.
// NOTE(review): presumably fuzziness is an edit distance and this is an
// optional capability of an IndexReader -- confirm against callers.
type IndexReaderFuzzy interface {
	FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)
}
// IndexReaderOnly is implemented by readers that can build a FieldDict
// for a field from an explicit list of terms (onlyTerms).
// NOTE(review): presumably includeCount controls whether DictEntry.Count is
// populated -- confirm against implementations.
type IndexReaderOnly interface {
	FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
}
  90. // FieldTerms contains the terms used by a document, keyed by field
  91. type FieldTerms map[string][]string
  92. // FieldsNotYetCached returns a list of fields not yet cached out of a larger list of fields
  93. func (f FieldTerms) FieldsNotYetCached(fields []string) []string {
  94. rv := make([]string, 0, len(fields))
  95. for _, field := range fields {
  96. if _, ok := f[field]; !ok {
  97. rv = append(rv, field)
  98. }
  99. }
  100. return rv
  101. }
  102. // Merge will combine two FieldTerms
  103. // it assumes that the terms lists are complete (thus do not need to be merged)
  104. // field terms from the other list always replace the ones in the receiver
  105. func (f FieldTerms) Merge(other FieldTerms) {
  106. for field, terms := range other {
  107. f[field] = terms
  108. }
  109. }
// TermFieldVector describes one occurrence of a term. TermFieldDoc.Vectors
// holds a slice of pointers to these.
// NOTE(review): field meanings below are inferred from names only --
// confirm against the code that populates them.
type TermFieldVector struct {
	Field          string   // presumably the name of the field the occurrence is in
	ArrayPositions []uint64 // presumably indexes into nested array values
	Pos            uint64   // presumably the token position
	Start          uint64   // presumably the start offset of the occurrence
	End            uint64   // presumably the end offset of the occurrence
}
  117. func (tfv *TermFieldVector) Size() int {
  118. return reflectStaticSizeTermFieldVector + size.SizeOfPtr +
  119. len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64
  120. }
  121. // IndexInternalID is an opaque document identifier interal to the index impl
  122. type IndexInternalID []byte
  123. func (id IndexInternalID) Equals(other IndexInternalID) bool {
  124. return id.Compare(other) == 0
  125. }
  126. func (id IndexInternalID) Compare(other IndexInternalID) int {
  127. return bytes.Compare(id, other)
  128. }
// TermFieldDoc is the value produced by TermFieldReader.Next and
// TermFieldReader.Advance: one document entry for a term in a field.
// Instances can be recycled with Reset.
type TermFieldDoc struct {
	Term    string
	ID      IndexInternalID // internal identifier of the document
	Freq    uint64          // presumably the term frequency in the document -- confirm
	Norm    float64         // presumably the field norm used for scoring -- confirm
	Vectors []*TermFieldVector
}
  136. func (tfd *TermFieldDoc) Size() int {
  137. sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr +
  138. len(tfd.Term) + len(tfd.ID)
  139. for _, entry := range tfd.Vectors {
  140. sizeInBytes += entry.Size()
  141. }
  142. return sizeInBytes
  143. }
  144. // Reset allows an already allocated TermFieldDoc to be reused
  145. func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
  146. // remember the []byte used for the ID
  147. id := tfd.ID
  148. vectors := tfd.Vectors
  149. // idiom to copy over from empty TermFieldDoc (0 allocations)
  150. *tfd = TermFieldDoc{}
  151. // reuse the []byte already allocated (and reset len to 0)
  152. tfd.ID = id[:0]
  153. tfd.Vectors = vectors[:0]
  154. return tfd
  155. }
// TermFieldReader is the interface exposing the enumeration of documents
// containing a given term in a given field. Documents are returned in byte
// lexicographic order over their identifiers.
type TermFieldReader interface {
	// Next returns the next document containing the term in this field, or nil
	// when it reaches the end of the enumeration. The preAlloced TermFieldDoc
	// is optional, and when non-nil, will be used instead of allocating memory.
	Next(preAlloced *TermFieldDoc) (*TermFieldDoc, error)

	// Advance resets the enumeration at specified document or its immediate
	// follower.
	Advance(ID IndexInternalID, preAlloced *TermFieldDoc) (*TermFieldDoc, error)

	// Count returns the number of documents containing the term in this field.
	Count() uint64

	// Close ends use of the reader.
	// NOTE(review): presumably releases resources held by it -- confirm.
	Close() error

	// Size returns a size figure for the reader.
	// NOTE(review): presumably a memory estimate in bytes, like
	// TermFieldDoc.Size -- confirm against implementations.
	Size() int
}
// DictEntry pairs a term with a count. It is the element type returned by
// FieldDict.Next.
// NOTE(review): presumably Count is the number of documents containing
// Term -- confirm against implementations.
type DictEntry struct {
	Term  string
	Count uint64
}
// FieldDict iterates over DictEntry values. Instances are obtained from the
// FieldDict* methods of IndexReader and related interfaces.
type FieldDict interface {
	// Next returns the next entry.
	// NOTE(review): presumably a nil entry signals the end of the
	// dictionary -- confirm against implementations.
	Next() (*DictEntry, error)

	// Close ends use of the dictionary iterator.
	Close() error
}
// DocIDReader is the interface exposing enumeration of documents identifiers.
// Close the reader to release associated resources.
type DocIDReader interface {
	// Next returns the next document internal identifier in the natural
	// index order, nil when the end of the sequence is reached.
	Next() (IndexInternalID, error)

	// Advance resets the iteration to the first internal identifier greater than
	// or equal to ID. If ID is smaller than the start of the range, the iteration
	// will start there instead. If ID is greater than or equal to the end of
	// the range, Next() call will return io.EOF.
	//
	// NOTE(review): this mentions io.EOF while the Next comment says nil is
	// returned at the end of the sequence -- confirm which one
	// implementations actually follow.
	Advance(ID IndexInternalID) (IndexInternalID, error)

	// Size returns a size figure for the reader.
	// NOTE(review): presumably a memory estimate in bytes -- confirm.
	Size() int

	Close() error
}
// BatchCallback is a function that receives an error. A Batch stores one
// via SetPersistedCallback and exposes it via PersistedCallback.
// NOTE(review): presumably it is invoked by the index once the batch has
// been persisted, with a non-nil error on failure -- confirm against callers.
type BatchCallback func(error)
// Batch collects document and internal key/value operations so they can be
// handed to Index.Batch together. Use NewBatch to obtain one with its maps
// allocated; the mutating methods write to the maps directly.
type Batch struct {
	// IndexOps maps a document id to the document to index. A nil value
	// records a deletion of that id (see Delete).
	IndexOps map[string]*document.Document

	// InternalOps maps an internal key (as a string) to the value to set.
	// A nil value records a deletion of that key (see DeleteInternal).
	InternalOps map[string][]byte

	// persistedCallback is the optional callback set through
	// SetPersistedCallback.
	persistedCallback BatchCallback
}
  200. func NewBatch() *Batch {
  201. return &Batch{
  202. IndexOps: make(map[string]*document.Document),
  203. InternalOps: make(map[string][]byte),
  204. }
  205. }
  206. func (b *Batch) Update(doc *document.Document) {
  207. b.IndexOps[doc.ID] = doc
  208. }
  209. func (b *Batch) Delete(id string) {
  210. b.IndexOps[id] = nil
  211. }
  212. func (b *Batch) SetInternal(key, val []byte) {
  213. b.InternalOps[string(key)] = val
  214. }
  215. func (b *Batch) DeleteInternal(key []byte) {
  216. b.InternalOps[string(key)] = nil
  217. }
// SetPersistedCallback stores f as the batch's persisted callback,
// replacing any callback set earlier. Passing nil clears it.
func (b *Batch) SetPersistedCallback(f BatchCallback) {
	b.persistedCallback = f
}
// PersistedCallback returns the callback stored by SetPersistedCallback, or
// nil if none has been set (or the batch has been Reset).
func (b *Batch) PersistedCallback() BatchCallback {
	return b.persistedCallback
}
  224. func (b *Batch) String() string {
  225. rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
  226. for k, v := range b.IndexOps {
  227. if v != nil {
  228. rv += fmt.Sprintf("\tINDEX - '%s'\n", k)
  229. } else {
  230. rv += fmt.Sprintf("\tDELETE - '%s'\n", k)
  231. }
  232. }
  233. for k, v := range b.InternalOps {
  234. if v != nil {
  235. rv += fmt.Sprintf("\tSET INTERNAL - '%s'\n", k)
  236. } else {
  237. rv += fmt.Sprintf("\tDELETE INTERNAL - '%s'\n", k)
  238. }
  239. }
  240. return rv
  241. }
  242. func (b *Batch) Reset() {
  243. b.IndexOps = make(map[string]*document.Document)
  244. b.InternalOps = make(map[string][]byte)
  245. b.persistedCallback = nil
  246. }
  247. func (b *Batch) Merge(o *Batch) {
  248. for k, v := range o.IndexOps {
  249. b.IndexOps[k] = v
  250. }
  251. for k, v := range o.InternalOps {
  252. b.InternalOps[k] = v
  253. }
  254. }
  255. func (b *Batch) TotalDocSize() int {
  256. var s int
  257. for k, v := range b.IndexOps {
  258. if v != nil {
  259. s += v.Size() + size.SizeOfString
  260. }
  261. s += len(k)
  262. }
  263. return s
  264. }
// Optimizable represents an optional interface that is implementable by
// optimizable resources (e.g., TermFieldReaders, Searchers). These
// optimizable resources are provided the same OptimizableContext
// instance, so that they can coordinate via dynamic interface
// casting.
type Optimizable interface {
	// Optimize takes the kind of optimization and the current context
	// (octx) and returns an OptimizableContext.
	// NOTE(review): presumably octx may be nil on the first call and the
	// returned context is passed to the next resource -- confirm against
	// callers.
	Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
}
// Optimized represents a result of optimization -- see the
// OptimizableContext.Finish() method. It is an empty interface, so callers
// must type-assert it to the concrete type they expect.
type Optimized interface{}
// OptimizableContext is the shared state handed to each Optimizable
// resource through Optimize, and completed with Finish.
type OptimizableContext interface {
	// Once all the optimizable resources have been provided the same
	// OptimizableContext instance, the optimization preparations are
	// finished or completed via the Finish() method.
	//
	// Depending on the optimization being performed, the Finish()
	// method might return a non-nil Optimized instance. For example,
	// the Optimized instance might represent an optimized
	// TermFieldReader instance.
	Finish() (Optimized, error)
}
// DocValueReader is returned by IndexReader.DocValueReader for a list of
// fields.
type DocValueReader interface {
	// VisitDocValues takes a document's internal id and a visitor callback
	// that receives a field name and a term.
	// NOTE(review): presumably the visitor is called once per term of each
	// requested field of that document -- confirm against implementations.
	VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error
}
上海开阖软件有限公司 沪ICP备12045867号-1