本站源代码
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

106 lines
2.7KB

  1. // Copyright (c) 2014 Couchbase, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. // Package lowercase implements a TokenFilter which converts
  15. // tokens to lower case according to unicode rules.
  16. package lowercase
  17. import (
  18. "bytes"
  19. "unicode"
  20. "unicode/utf8"
  21. "github.com/blevesearch/bleve/analysis"
  22. "github.com/blevesearch/bleve/registry"
  23. )
  // Name is the key under which the lower-casing token filter
  // constructor is registered with the bleve registry.
  const Name = "to_lower"
  // LowerCaseFilter is a token filter that lower-cases token terms.
  // It has no fields, so it carries no configuration or state.
  type LowerCaseFilter struct{}
  28. func NewLowerCaseFilter() *LowerCaseFilter {
  29. return &LowerCaseFilter{}
  30. }
  31. func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
  32. for _, token := range input {
  33. token.Term = toLowerDeferredCopy(token.Term)
  34. }
  35. return input
  36. }
  37. func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
  38. return NewLowerCaseFilter(), nil
  39. }
  40. func init() {
  41. registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor)
  42. }
  // toLowerDeferredCopy lower-cases s rune by rune, overwriting the
  // original byte slice whenever possible instead of allocating.
  //
  // The result is one of:
  //   - a prefix of s, rewritten in place. It is shorter than s when some
  //     lower-case form has a shorter UTF-8 encoding than its upper-case
  //     form (for example the Kelvin sign, 3 bytes, becomes 'k', 1 byte);
  //   - a newly allocated slice, when some lower-case form has a longer
  //     UTF-8 encoding than the original rune and cannot be written in
  //     place. In that case the remainder is delegated to bytes.ToLower.
  //
  // Unlike bytes.ToLower, the in-place path maps a trailing Greek 'Σ' to
  // the final-sigma form 'ς'. The bytes.ToLower fallback does not apply
  // that rule.
  //
  // Callers must treat s as consumed: its contents may be modified.
  func toLowerDeferredCopy(s []byte) []byte {
  	// i is the read offset into s, j is the write offset. j never
  	// exceeds i, so writing at j cannot clobber bytes not yet read.
  	j := 0
  	for i := 0; i < len(s); {
  		wid := 1
  		r := rune(s[i])
  		if r >= utf8.RuneSelf {
  			r, wid = utf8.DecodeRune(s[i:])
  		}

  		l := unicode.ToLower(r)

  		if l == r {
  			// Already lower case. If an earlier rune shrank when
  			// lower-cased, the write offset trails the read offset
  			// and these bytes must be shifted left; otherwise stale
  			// bytes would remain in the output.
  			if j != i {
  				copy(s[j:], s[i:i+wid])
  			}
  			i += wid
  			j += wid
  			continue
  		}

  		// Greek final sigma: 'Σ' occupies two bytes, so i+2 == len(s)
  		// means it is the last rune of the term, which takes the
  		// final form 'ς' rather than 'σ'.
  		if l == 'σ' && i+2 == len(s) {
  			l = 'ς'
  		}

  		lwid := utf8.RuneLen(l)
  		if lwid > wid {
  			// The replacement is wider than the original rune, so it
  			// cannot be written in place. Punt and let bytes.ToLower
  			// handle the remainder in a fresh buffer. Only known to
  			// happen with:
  			//   Ⱥ (570) width 2 -> ⱥ (11365) width 3
  			//   Ⱦ (574) width 2 -> ⱦ (11366) width 3
  			rest := bytes.ToLower(s[i:])
  			rv := make([]byte, j+len(rest))
  			copy(rv[:j], s[:j])
  			copy(rv[j:], rest)
  			return rv
  		}

  		utf8.EncodeRune(s[j:], l)
  		i += wid
  		j += lwid
  	}
  	return s[:j]
  }
上海开阖软件有限公司 沪ICP备12045867号-1