本站源代码
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

132 lines
3.0KB

  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package unicode
  15. import (
  16. "github.com/blevesearch/segment"
  17. "github.com/blevesearch/bleve/analysis"
  18. "github.com/blevesearch/bleve/registry"
  19. )
  20. const Name = "unicode"
  21. type UnicodeTokenizer struct {
  22. }
  23. func NewUnicodeTokenizer() *UnicodeTokenizer {
  24. return &UnicodeTokenizer{}
  25. }
  26. func (rt *UnicodeTokenizer) Tokenize(input []byte) analysis.TokenStream {
  27. rvx := make([]analysis.TokenStream, 0, 10) // When rv gets full, append to rvx.
  28. rv := make(analysis.TokenStream, 0, 1)
  29. ta := []analysis.Token(nil)
  30. taNext := 0
  31. segmenter := segment.NewWordSegmenterDirect(input)
  32. start := 0
  33. pos := 1
  34. guessRemaining := func(end int) int {
  35. avgSegmentLen := end / (len(rv) + 1)
  36. if avgSegmentLen < 1 {
  37. avgSegmentLen = 1
  38. }
  39. remainingLen := len(input) - end
  40. return remainingLen / avgSegmentLen
  41. }
  42. for segmenter.Segment() {
  43. segmentBytes := segmenter.Bytes()
  44. end := start + len(segmentBytes)
  45. if segmenter.Type() != segment.None {
  46. if taNext >= len(ta) {
  47. remainingSegments := guessRemaining(end)
  48. if remainingSegments > 1000 {
  49. remainingSegments = 1000
  50. }
  51. if remainingSegments < 1 {
  52. remainingSegments = 1
  53. }
  54. ta = make([]analysis.Token, remainingSegments)
  55. taNext = 0
  56. }
  57. token := &ta[taNext]
  58. taNext++
  59. token.Term = segmentBytes
  60. token.Start = start
  61. token.End = end
  62. token.Position = pos
  63. token.Type = convertType(segmenter.Type())
  64. if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
  65. rvx = append(rvx, rv)
  66. rvCap := cap(rv) * 2
  67. if rvCap > 256 {
  68. rvCap = 256
  69. }
  70. rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
  71. }
  72. rv = append(rv, token)
  73. pos++
  74. }
  75. start = end
  76. }
  77. if len(rvx) > 0 {
  78. n := len(rv)
  79. for _, r := range rvx {
  80. n += len(r)
  81. }
  82. rall := make(analysis.TokenStream, 0, n)
  83. for _, r := range rvx {
  84. rall = append(rall, r...)
  85. }
  86. return append(rall, rv...)
  87. }
  88. return rv
  89. }
// UnicodeTokenizerConstructor is the constructor function that init registers
// under Name. It returns a new UnicodeTokenizer and a nil error; neither
// config nor cache is read.
func UnicodeTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	return NewUnicodeTokenizer(), nil
}
// init registers UnicodeTokenizerConstructor with the registry package under
// Name when this package is initialized.
func init() {
	registry.RegisterTokenizer(Name, UnicodeTokenizerConstructor)
}
  96. func convertType(segmentWordType int) analysis.TokenType {
  97. switch segmentWordType {
  98. case segment.Ideo:
  99. return analysis.Ideographic
  100. case segment.Kana:
  101. return analysis.Ideographic
  102. case segment.Number:
  103. return analysis.Numeric
  104. }
  105. return analysis.AlphaNumeric
  106. }
上海开阖软件有限公司 沪ICP备12045867号-1