|
- // Copyright (c) 2015 Couchbase, Inc.
- // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
- // except in compliance with the License. You may obtain a copy of the License at
- // http://www.apache.org/licenses/LICENSE-2.0
- // Unless required by applicable law or agreed to in writing, software distributed under the
- // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
- // either express or implied. See the License for the specific language governing permissions
- // and limitations under the License.
-
- // +build BUILDTAGS
-
- package segment
-
- import (
- "fmt"
- "unicode/utf8"
- )
-
- var RagelFlags = "RAGELFLAGS" // NOTE(review): looks like a placeholder filled in when this template is processed (cf. BUILDTAGS in the build constraint) — confirm
- // ParseError is the error segmentWords returns when the Ragel machine stops in a state below s_first_final.
- var ParseError = fmt.Errorf("unicode word segmentation parse error")
-
- // Word types: segmentWords appends one of these to its types slice for every token it emits.
- const (
- None = iota // 0; appended by finishNoneToken (regional indicators, CR/LF/newline, Extend/Format runs, anything else)
- Number // appended by finishNumericToken
- Letter // appended by finishWordToken and finishHangulToken
- Kana // not appended by any action visible in segmentWords
- Ideo // appended by finishHanToken, finishHiraganaToken and finishKatakanaToken
- )
- // Ragel section: names the machine "s" and asks Ragel to write its generated data at this point.
- %%{
- machine s;
- write data;
- }%%
- // segmentWords scans data with Ragel machine s, appending each emitted token (a sub-slice of data) to val and its word type to types; it returns both slices, the byte count covered by emitted tokens, and ParseError if the machine ends below s_first_final.
- func segmentWords(data []byte, maxTokens int, atEOF bool, val [][]byte, types []int) ([][]byte, []int, int, error) {
- cs, p, pe := 0, 0, len(data) // Ragel variables: current state, current offset into data, end offset
- cap := maxTokens // initial capacity for result slices allocated below; NOTE(review): shadows the builtin cap
- if cap < 0 {
- cap = 1000 // negative maxTokens: use a fixed initial capacity (the token limit itself only applies when maxTokens > 0)
- }
- if val == nil { // allocate only when the caller did not supply a slice to append to
- val = make([][]byte, 0, cap)
- }
- if types == nil { // same for the parallel word-type slice
- types = make([]int, 0, cap)
- }
-
- // extra variables the generated scanner code refers to (added for scanner)
- ts := 0
- te := 0
- act := 0
- eof := pe
- _ = ts // blank assignments so the Go compiler does not reject these as unused
- _ = te
- _ = act
-
- // our state: byte offsets of the token being matched (endPos is inclusive) and the bytes covered by emitted tokens
- startPos := 0
- endPos := 0
- totalConsumed := 0
- %%{
-
- include SCRIPTS "ragel/uscript.rl";
- include WB "ragel/uwb.rl";
-
- action startToken {
- startPos = p // attached with > below: remember the offset where the pattern starts
- }
-
- action endToken {
- endPos = p // attached with @ below: offset of the last matched byte, inclusive
- }
-
- action finishNumericToken {
- if !atEOF { // atEOF is false: emit nothing, return what was collected so far
- return val, types, totalConsumed, nil
- }
-
- val = append(val, data[startPos:endPos+1]) // endPos is inclusive, hence +1
- types = append(types, Number)
- totalConsumed = endPos+1
- if maxTokens > 0 && len(val) >= maxTokens { // stop early once a positive token limit is reached
- return val, types, totalConsumed, nil
- }
- }
-
- action finishHangulToken {
- if endPos+1 == pe && !atEOF { // token runs to the end of data and atEOF is false: do not emit it
- return val, types, totalConsumed, nil
- } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 { // bytes after the token are not valid UTF-8 (empty input reports size 0 and passes)
- return val, types, totalConsumed, nil
- }
-
- val = append(val, data[startPos:endPos+1])
- types = append(types, Letter) // Hangul tokens are typed Letter
- totalConsumed = endPos+1
- if maxTokens > 0 && len(val) >= maxTokens {
- return val, types, totalConsumed, nil
- }
- }
-
- action finishKatakanaToken {
- if endPos+1 == pe && !atEOF { // same end-of-buffer guard as finishHangulToken
- return val, types, totalConsumed, nil
- } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
- return val, types, totalConsumed, nil
- }
-
- val = append(val, data[startPos:endPos+1])
- types = append(types, Ideo) // NOTE(review): Katakana is typed Ideo while the Kana constant goes unused — confirm intended
- totalConsumed = endPos+1
- if maxTokens > 0 && len(val) >= maxTokens {
- return val, types, totalConsumed, nil
- }
- }
-
- action finishWordToken {
- if !atEOF { // as with numeric tokens, nothing is emitted unless atEOF is true
- return val, types, totalConsumed, nil
- }
- val = append(val, data[startPos:endPos+1])
- types = append(types, Letter)
- totalConsumed = endPos+1
- if maxTokens > 0 && len(val) >= maxTokens {
- return val, types, totalConsumed, nil
- }
- }
-
- action finishHanToken {
- if endPos+1 == pe && !atEOF { // same end-of-buffer guard as finishHangulToken
- return val, types, totalConsumed, nil
- } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
- return val, types, totalConsumed, nil
- }
-
- val = append(val, data[startPos:endPos+1])
- types = append(types, Ideo)
- totalConsumed = endPos+1
- if maxTokens > 0 && len(val) >= maxTokens {
- return val, types, totalConsumed, nil
- }
- }
-
- action finishHiraganaToken {
- if endPos+1 == pe && !atEOF { // same end-of-buffer guard as finishHangulToken
- return val, types, totalConsumed, nil
- } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
- return val, types, totalConsumed, nil
- }
-
- val = append(val, data[startPos:endPos+1])
- types = append(types, Ideo) // NOTE(review): Hiragana is also typed Ideo, not Kana — confirm intended
- totalConsumed = endPos+1
- if maxTokens > 0 && len(val) >= maxTokens {
- return val, types, totalConsumed, nil
- }
- }
-
- action finishNoneToken {
- lastPos := startPos
- for lastPos <= endPos { // walk rune by rune from startPos until the position passes endPos
- _, size := utf8.DecodeRune(data[lastPos:])
- lastPos += size
- }
- endPos = lastPos -1 // last byte of the rune that covers the previous endPos
- p = endPos // move the scanner offset to that byte as well
-
- if endPos+1 == pe && !atEOF { // same end-of-buffer guard as the other finish actions
- return val, types, totalConsumed, nil
- } else if dr, size := utf8.DecodeRune(data[endPos+1:]); dr == utf8.RuneError && size == 1 {
- return val, types, totalConsumed, nil
- }
- // otherwise, consume this as well, typed None
- val = append(val, data[startPos:endPos+1])
- types = append(types, None)
- totalConsumed = endPos+1
- if maxTokens > 0 && len(val) >= maxTokens {
- return val, types, totalConsumed, nil
- }
- }
-
- HangulEx = Hangul ( Extend | Format )*;
- HebrewOrALetterEx = ( Hebrew_Letter | ALetter ) ( Extend | Format )*;
- NumericEx = Numeric ( Extend | Format )*;
- KatakanaEx = Katakana ( Extend | Format )*;
- MidLetterEx = ( MidLetter | MidNumLet | Single_Quote ) ( Extend | Format )*;
- MidNumericEx = ( MidNum | MidNumLet | Single_Quote ) ( Extend | Format )*;
- ExtendNumLetEx = ExtendNumLet ( Extend | Format )*;
- HanEx = Han ( Extend | Format )*;
- HiraganaEx = Hiragana ( Extend | Format )*;
- SingleQuoteEx = Single_Quote ( Extend | Format )*;
- DoubleQuoteEx = Double_Quote ( Extend | Format )*;
- HebrewLetterEx = Hebrew_Letter ( Extend | Format )*;
- RegionalIndicatorEx = Regional_Indicator ( Extend | Format )*;
- NLCRLF = Newline | CR | LF;
- OtherEx = ^(NLCRLF) ( Extend | Format )* ;
-
- # UAX#29 WB8. Numeric × Numeric
- # WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
- # WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
- # WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
- # WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
- #
- WordNumeric = ( ( ExtendNumLetEx )* NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )* ( ExtendNumLetEx )* ) >startToken @endToken;
-
- # subset of the below for typing purposes only!
- WordHangul = ( HangulEx )+ >startToken @endToken;
- WordKatakana = ( KatakanaEx )+ >startToken @endToken;
-
- # UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
- # WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
- # WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
- # WB7a. Hebrew_Letter × Single_Quote
- # WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
- # WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
- # WB9. (ALetter | Hebrew_Letter) × Numeric
- # WB10. Numeric × (ALetter | Hebrew_Letter)
- # WB13. Katakana × Katakana
- # WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
- # WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
- #
- # Marty -deviated here to allow for (ExtendNumLetEx x ExtendNumLetEx) part of 13a
- #
- Word = ( ( ExtendNumLetEx )* ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
- | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
- | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
- | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
- |ExtendNumLetEx
- )+
- )
- (
- ( ExtendNumLetEx )+ ( KatakanaEx ( ( ExtendNumLetEx )* KatakanaEx )*
- | ( HebrewLetterEx ( SingleQuoteEx | DoubleQuoteEx HebrewLetterEx )
- | NumericEx ( ( ( ExtendNumLetEx )* | MidNumericEx ) NumericEx )*
- | HebrewOrALetterEx ( ( ( ExtendNumLetEx )* | MidLetterEx ) HebrewOrALetterEx )*
- )+
- )
- )* ExtendNumLetEx*) >startToken @endToken;
-
- # UAX#29 WB14. Any ÷ Any
- WordHan = HanEx >startToken @endToken;
- WordHiragana = HiraganaEx >startToken @endToken;
-
- WordExt = ( ( Extend | Format )* ) >startToken @endToken; # maybe plus not star
-
- WordCRLF = (CR LF) >startToken @endToken;
-
- WordCR = CR >startToken @endToken;
-
- WordLF = LF >startToken @endToken;
-
- WordNL = Newline >startToken @endToken;
-
- WordRegional = (RegionalIndicatorEx+) >startToken @endToken;
-
- Other = OtherEx >startToken @endToken;
-
- main := |*
- WordNumeric => finishNumericToken;
- WordHangul => finishHangulToken;
- WordKatakana => finishKatakanaToken;
- Word => finishWordToken;
- WordHan => finishHanToken;
- WordHiragana => finishHiraganaToken;
- WordRegional =>finishNoneToken;
- WordCRLF => finishNoneToken;
- WordCR => finishNoneToken;
- WordLF => finishNoneToken;
- WordNL => finishNoneToken;
- WordExt => finishNoneToken;
- Other => finishNoneToken;
- *|;
-
- write init;
- write exec;
- }%%
-
- if cs < s_first_final { // machine stopped before reaching a final state: report a parse error along with what was collected
- return val, types, totalConsumed, ParseError
- }
-
- return val, types, totalConsumed, nil
- }
|