本站源代码
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

840 lines
16KB

  1. package porterstemmer
  2. import (
  3. // "log"
  4. "unicode"
  5. )
  // isConsonant reports whether the rune at index i of s counts as a consonant.
  //
  // The lowercase letters 'a', 'e', 'i', 'o' and 'u' are never consonants. A
  // 'y' is a consonant when it is the first rune of s or when the rune before
  // it is not a consonant. Every other rune (including uppercase letters) is
  // treated as a consonant.
  //
  // i must be a valid index into s.
  func isConsonant(s []rune, i int) bool {
  	// A 'y' takes the opposite classification of the rune before it, so walk
  	// left through any run of 'y' and remember whether the answer found at
  	// the end of the walk has to be inverted.
  	invert := false
  	for {
  		switch s[i] {
  		case 'a', 'e', 'i', 'o', 'u':
  			return invert
  		case 'y':
  			if i == 0 {
  				return !invert
  			}
  			invert = !invert
  			i--
  		default:
  			return !invert
  		}
  	}
  }
  24. func measure(s []rune) uint {
  25. // Initialize.
  26. lenS := len(s)
  27. result := uint(0)
  28. i := 0
  29. // Short Circuit.
  30. if 0 == lenS {
  31. /////////// RETURN
  32. return result
  33. }
  34. // Ignore (potential) consonant sequence at the beginning of word.
  35. for isConsonant(s, i) {
  36. //DEBUG
  37. //log.Printf("[measure([%s])] Eat Consonant [%d] -> [%s]", string(s), i, string(s[i]))
  38. i++
  39. if i >= lenS {
  40. /////////////// RETURN
  41. return result
  42. }
  43. }
  44. // For each pair of a vowel sequence followed by a consonant sequence, increment result.
  45. Outer:
  46. for i < lenS {
  47. for !isConsonant(s, i) {
  48. //DEBUG
  49. //log.Printf("[measure([%s])] VOWEL [%d] -> [%s]", string(s), i, string(s[i]))
  50. i++
  51. if i >= lenS {
  52. /////////// BREAK
  53. break Outer
  54. }
  55. }
  56. for isConsonant(s, i) {
  57. //DEBUG
  58. //log.Printf("[measure([%s])] CONSONANT [%d] -> [%s]", string(s), i, string(s[i]))
  59. i++
  60. if i >= lenS {
  61. result++
  62. /////////// BREAK
  63. break Outer
  64. }
  65. }
  66. result++
  67. }
  68. // Return
  69. return result
  70. }
  // hasSuffix reports whether s ends with suffix AND is strictly longer than
  // it, i.e. whether removing the suffix would leave at least one rune. A word
  // that is exactly equal to the suffix therefore does not match; the stemming
  // steps rely on this to index into the remaining stem safely.
  //
  // An empty suffix matches every non-empty s.
  func hasSuffix(s, suffix []rune) bool {
  	stemLen := len(s) - len(suffix)
  	if stemLen <= 0 {
  		return false
  	}
  	// Compare back to front: words usually differ from a candidate suffix in
  	// their final rune, so a mismatch is found on the first comparison.
  	for i := len(suffix) - 1; i >= 0; i-- {
  		if s[stemLen+i] != suffix[i] {
  			return false
  		}
  	}
  	return true
  }
  89. func containsVowel(s []rune) bool {
  90. lenS := len(s)
  91. for i := 0; i < lenS; i++ {
  92. if !isConsonant(s, i) {
  93. /////////// RETURN
  94. return true
  95. }
  96. }
  97. return false
  98. }
  99. func hasRepeatDoubleConsonantSuffix(s []rune) bool {
  100. // Initialize.
  101. lenS := len(s)
  102. result := false
  103. // Do it!
  104. if 2 > lenS {
  105. result = false
  106. } else if s[lenS-1] == s[lenS-2] && isConsonant(s, lenS-1) { // Will using isConsonant() cause a problem with "YY"?
  107. result = true
  108. } else {
  109. result = false
  110. }
  111. // Return,
  112. return result
  113. }
  114. func hasConsonantVowelConsonantSuffix(s []rune) bool {
  115. // Initialize.
  116. lenS := len(s)
  117. result := false
  118. // Do it!
  119. if 3 > lenS {
  120. result = false
  121. } else if isConsonant(s, lenS-3) && !isConsonant(s, lenS-2) && isConsonant(s, lenS-1) {
  122. result = true
  123. } else {
  124. result = false
  125. }
  126. // Return
  127. return result
  128. }
  129. func step1a(s []rune) []rune {
  130. // Initialize.
  131. var result []rune = s
  132. lenS := len(s)
  133. // Do it!
  134. if suffix := []rune("sses"); hasSuffix(s, suffix) {
  135. lenTrim := 2
  136. subSlice := s[:lenS-lenTrim]
  137. result = subSlice
  138. } else if suffix := []rune("ies"); hasSuffix(s, suffix) {
  139. lenTrim := 2
  140. subSlice := s[:lenS-lenTrim]
  141. result = subSlice
  142. } else if suffix := []rune("ss"); hasSuffix(s, suffix) {
  143. result = s
  144. } else if suffix := []rune("s"); hasSuffix(s, suffix) {
  145. lenSuffix := 1
  146. subSlice := s[:lenS-lenSuffix]
  147. result = subSlice
  148. }
  149. // Return.
  150. return result
  151. }
  152. func step1b(s []rune) []rune {
  153. // Initialize.
  154. var result []rune = s
  155. lenS := len(s)
  156. // Do it!
  157. if suffix := []rune("eed"); hasSuffix(s, suffix) {
  158. lenSuffix := len(suffix)
  159. subSlice := s[:lenS-lenSuffix]
  160. m := measure(subSlice)
  161. if 0 < m {
  162. lenTrim := 1
  163. result = s[:lenS-lenTrim]
  164. }
  165. } else if suffix := []rune("ed"); hasSuffix(s, suffix) {
  166. lenSuffix := len(suffix)
  167. subSlice := s[:lenS-lenSuffix]
  168. if containsVowel(subSlice) {
  169. if suffix2 := []rune("at"); hasSuffix(subSlice, suffix2) {
  170. lenTrim := -1
  171. result = s[:lenS-lenSuffix-lenTrim]
  172. } else if suffix2 := []rune("bl"); hasSuffix(subSlice, suffix2) {
  173. lenTrim := -1
  174. result = s[:lenS-lenSuffix-lenTrim]
  175. } else if suffix2 := []rune("iz"); hasSuffix(subSlice, suffix2) {
  176. lenTrim := -1
  177. result = s[:lenS-lenSuffix-lenTrim]
  178. } else if c := subSlice[len(subSlice)-1]; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) {
  179. lenTrim := 1
  180. lenSubSlice := len(subSlice)
  181. result = subSlice[:lenSubSlice-lenTrim]
  182. } else if c := subSlice[len(subSlice)-1]; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c {
  183. lenTrim := -1
  184. result = s[:lenS-lenSuffix-lenTrim]
  185. result[len(result)-1] = 'e'
  186. } else {
  187. result = subSlice
  188. }
  189. }
  190. } else if suffix := []rune("ing"); hasSuffix(s, suffix) {
  191. lenSuffix := len(suffix)
  192. subSlice := s[:lenS-lenSuffix]
  193. if containsVowel(subSlice) {
  194. if suffix2 := []rune("at"); hasSuffix(subSlice, suffix2) {
  195. lenTrim := -1
  196. result = s[:lenS-lenSuffix-lenTrim]
  197. result[len(result)-1] = 'e'
  198. } else if suffix2 := []rune("bl"); hasSuffix(subSlice, suffix2) {
  199. lenTrim := -1
  200. result = s[:lenS-lenSuffix-lenTrim]
  201. result[len(result)-1] = 'e'
  202. } else if suffix2 := []rune("iz"); hasSuffix(subSlice, suffix2) {
  203. lenTrim := -1
  204. result = s[:lenS-lenSuffix-lenTrim]
  205. result[len(result)-1] = 'e'
  206. } else if c := subSlice[len(subSlice)-1]; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) {
  207. lenTrim := 1
  208. lenSubSlice := len(subSlice)
  209. result = subSlice[:lenSubSlice-lenTrim]
  210. } else if c := subSlice[len(subSlice)-1]; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c {
  211. lenTrim := -1
  212. result = s[:lenS-lenSuffix-lenTrim]
  213. result[len(result)-1] = 'e'
  214. } else {
  215. result = subSlice
  216. }
  217. }
  218. }
  219. // Return.
  220. return result
  221. }
  222. func step1c(s []rune) []rune {
  223. // Initialize.
  224. lenS := len(s)
  225. result := s
  226. // Do it!
  227. if 2 > lenS {
  228. /////////// RETURN
  229. return result
  230. }
  231. if 'y' == s[lenS-1] && containsVowel(s[:lenS-1]) {
  232. result[lenS-1] = 'i'
  233. } else if 'Y' == s[lenS-1] && containsVowel(s[:lenS-1]) {
  234. result[lenS-1] = 'I'
  235. }
  236. // Return.
  237. return result
  238. }
  239. func step2(s []rune) []rune {
  240. // Initialize.
  241. lenS := len(s)
  242. result := s
  243. // Do it!
  244. if suffix := []rune("ational"); hasSuffix(s, suffix) {
  245. if 0 < measure(s[:lenS-len(suffix)]) {
  246. result[lenS-5] = 'e'
  247. result = result[:lenS-4]
  248. }
  249. } else if suffix := []rune("tional"); hasSuffix(s, suffix) {
  250. if 0 < measure(s[:lenS-len(suffix)]) {
  251. result = result[:lenS-2]
  252. }
  253. } else if suffix := []rune("enci"); hasSuffix(s, suffix) {
  254. if 0 < measure(s[:lenS-len(suffix)]) {
  255. result[lenS-1] = 'e'
  256. }
  257. } else if suffix := []rune("anci"); hasSuffix(s, suffix) {
  258. if 0 < measure(s[:lenS-len(suffix)]) {
  259. result[lenS-1] = 'e'
  260. }
  261. } else if suffix := []rune("izer"); hasSuffix(s, suffix) {
  262. if 0 < measure(s[:lenS-len(suffix)]) {
  263. result = s[:lenS-1]
  264. }
  265. } else if suffix := []rune("bli"); hasSuffix(s, suffix) { // --DEPARTURE--
  266. // } else if suffix := []rune("abli") ; hasSuffix(s, suffix) {
  267. if 0 < measure(s[:lenS-len(suffix)]) {
  268. result[lenS-1] = 'e'
  269. }
  270. } else if suffix := []rune("alli"); hasSuffix(s, suffix) {
  271. if 0 < measure(s[:lenS-len(suffix)]) {
  272. result = s[:lenS-2]
  273. }
  274. } else if suffix := []rune("entli"); hasSuffix(s, suffix) {
  275. if 0 < measure(s[:lenS-len(suffix)]) {
  276. result = s[:lenS-2]
  277. }
  278. } else if suffix := []rune("eli"); hasSuffix(s, suffix) {
  279. if 0 < measure(s[:lenS-len(suffix)]) {
  280. result = s[:lenS-2]
  281. }
  282. } else if suffix := []rune("ousli"); hasSuffix(s, suffix) {
  283. if 0 < measure(s[:lenS-len(suffix)]) {
  284. result = s[:lenS-2]
  285. }
  286. } else if suffix := []rune("ization"); hasSuffix(s, suffix) {
  287. if 0 < measure(s[:lenS-len(suffix)]) {
  288. result[lenS-5] = 'e'
  289. result = s[:lenS-4]
  290. }
  291. } else if suffix := []rune("ation"); hasSuffix(s, suffix) {
  292. if 0 < measure(s[:lenS-len(suffix)]) {
  293. result[lenS-3] = 'e'
  294. result = s[:lenS-2]
  295. }
  296. } else if suffix := []rune("ator"); hasSuffix(s, suffix) {
  297. if 0 < measure(s[:lenS-len(suffix)]) {
  298. result[lenS-2] = 'e'
  299. result = s[:lenS-1]
  300. }
  301. } else if suffix := []rune("alism"); hasSuffix(s, suffix) {
  302. if 0 < measure(s[:lenS-len(suffix)]) {
  303. result = s[:lenS-3]
  304. }
  305. } else if suffix := []rune("iveness"); hasSuffix(s, suffix) {
  306. if 0 < measure(s[:lenS-len(suffix)]) {
  307. result = s[:lenS-4]
  308. }
  309. } else if suffix := []rune("fulness"); hasSuffix(s, suffix) {
  310. if 0 < measure(s[:lenS-len(suffix)]) {
  311. result = s[:lenS-4]
  312. }
  313. } else if suffix := []rune("ousness"); hasSuffix(s, suffix) {
  314. if 0 < measure(s[:lenS-len(suffix)]) {
  315. result = s[:lenS-4]
  316. }
  317. } else if suffix := []rune("aliti"); hasSuffix(s, suffix) {
  318. if 0 < measure(s[:lenS-len(suffix)]) {
  319. result = s[:lenS-3]
  320. }
  321. } else if suffix := []rune("iviti"); hasSuffix(s, suffix) {
  322. if 0 < measure(s[:lenS-len(suffix)]) {
  323. result[lenS-3] = 'e'
  324. result = result[:lenS-2]
  325. }
  326. } else if suffix := []rune("biliti"); hasSuffix(s, suffix) {
  327. if 0 < measure(s[:lenS-len(suffix)]) {
  328. result[lenS-5] = 'l'
  329. result[lenS-4] = 'e'
  330. result = result[:lenS-3]
  331. }
  332. } else if suffix := []rune("logi"); hasSuffix(s, suffix) { // --DEPARTURE--
  333. if 0 < measure(s[:lenS-len(suffix)]) {
  334. lenTrim := 1
  335. result = s[:lenS-lenTrim]
  336. }
  337. }
  338. // Return.
  339. return result
  340. }
  341. func step3(s []rune) []rune {
  342. // Initialize.
  343. lenS := len(s)
  344. result := s
  345. // Do it!
  346. if suffix := []rune("icate"); hasSuffix(s, suffix) {
  347. lenSuffix := len(suffix)
  348. if 0 < measure(s[:lenS-lenSuffix]) {
  349. result = result[:lenS-3]
  350. }
  351. } else if suffix := []rune("ative"); hasSuffix(s, suffix) {
  352. lenSuffix := len(suffix)
  353. subSlice := s[:lenS-lenSuffix]
  354. m := measure(subSlice)
  355. if 0 < m {
  356. result = subSlice
  357. }
  358. } else if suffix := []rune("alize"); hasSuffix(s, suffix) {
  359. lenSuffix := len(suffix)
  360. if 0 < measure(s[:lenS-lenSuffix]) {
  361. result = result[:lenS-3]
  362. }
  363. } else if suffix := []rune("iciti"); hasSuffix(s, suffix) {
  364. lenSuffix := len(suffix)
  365. if 0 < measure(s[:lenS-lenSuffix]) {
  366. result = result[:lenS-3]
  367. }
  368. } else if suffix := []rune("ical"); hasSuffix(s, suffix) {
  369. lenSuffix := len(suffix)
  370. if 0 < measure(s[:lenS-lenSuffix]) {
  371. result = result[:lenS-2]
  372. }
  373. } else if suffix := []rune("ful"); hasSuffix(s, suffix) {
  374. lenSuffix := len(suffix)
  375. subSlice := s[:lenS-lenSuffix]
  376. m := measure(subSlice)
  377. if 0 < m {
  378. result = subSlice
  379. }
  380. } else if suffix := []rune("ness"); hasSuffix(s, suffix) {
  381. lenSuffix := len(suffix)
  382. subSlice := s[:lenS-lenSuffix]
  383. m := measure(subSlice)
  384. if 0 < m {
  385. result = subSlice
  386. }
  387. }
  388. // Return.
  389. return result
  390. }
  391. func step4(s []rune) []rune {
  392. // Initialize.
  393. lenS := len(s)
  394. result := s
  395. // Do it!
  396. if suffix := []rune("al"); hasSuffix(s, suffix) {
  397. lenSuffix := len(suffix)
  398. subSlice := s[:lenS-lenSuffix]
  399. m := measure(subSlice)
  400. if 1 < m {
  401. result = result[:lenS-lenSuffix]
  402. }
  403. } else if suffix := []rune("ance"); hasSuffix(s, suffix) {
  404. lenSuffix := len(suffix)
  405. subSlice := s[:lenS-lenSuffix]
  406. m := measure(subSlice)
  407. if 1 < m {
  408. result = result[:lenS-lenSuffix]
  409. }
  410. } else if suffix := []rune("ence"); hasSuffix(s, suffix) {
  411. lenSuffix := len(suffix)
  412. subSlice := s[:lenS-lenSuffix]
  413. m := measure(subSlice)
  414. if 1 < m {
  415. result = result[:lenS-lenSuffix]
  416. }
  417. } else if suffix := []rune("er"); hasSuffix(s, suffix) {
  418. lenSuffix := len(suffix)
  419. subSlice := s[:lenS-lenSuffix]
  420. m := measure(subSlice)
  421. if 1 < m {
  422. result = subSlice
  423. }
  424. } else if suffix := []rune("ic"); hasSuffix(s, suffix) {
  425. lenSuffix := len(suffix)
  426. subSlice := s[:lenS-lenSuffix]
  427. m := measure(subSlice)
  428. if 1 < m {
  429. result = subSlice
  430. }
  431. } else if suffix := []rune("able"); hasSuffix(s, suffix) {
  432. lenSuffix := len(suffix)
  433. subSlice := s[:lenS-lenSuffix]
  434. m := measure(subSlice)
  435. if 1 < m {
  436. result = subSlice
  437. }
  438. } else if suffix := []rune("ible"); hasSuffix(s, suffix) {
  439. lenSuffix := len(suffix)
  440. subSlice := s[:lenS-lenSuffix]
  441. m := measure(subSlice)
  442. if 1 < m {
  443. result = subSlice
  444. }
  445. } else if suffix := []rune("ant"); hasSuffix(s, suffix) {
  446. lenSuffix := len(suffix)
  447. subSlice := s[:lenS-lenSuffix]
  448. m := measure(subSlice)
  449. if 1 < m {
  450. result = subSlice
  451. }
  452. } else if suffix := []rune("ement"); hasSuffix(s, suffix) {
  453. lenSuffix := len(suffix)
  454. subSlice := s[:lenS-lenSuffix]
  455. m := measure(subSlice)
  456. if 1 < m {
  457. result = subSlice
  458. }
  459. } else if suffix := []rune("ment"); hasSuffix(s, suffix) {
  460. lenSuffix := len(suffix)
  461. subSlice := s[:lenS-lenSuffix]
  462. m := measure(subSlice)
  463. if 1 < m {
  464. result = subSlice
  465. }
  466. } else if suffix := []rune("ent"); hasSuffix(s, suffix) {
  467. lenSuffix := len(suffix)
  468. subSlice := s[:lenS-lenSuffix]
  469. m := measure(subSlice)
  470. if 1 < m {
  471. result = subSlice
  472. }
  473. } else if suffix := []rune("ion"); hasSuffix(s, suffix) {
  474. lenSuffix := len(suffix)
  475. subSlice := s[:lenS-lenSuffix]
  476. m := measure(subSlice)
  477. c := subSlice[len(subSlice)-1]
  478. if 1 < m && ('s' == c || 't' == c) {
  479. result = subSlice
  480. }
  481. } else if suffix := []rune("ou"); hasSuffix(s, suffix) {
  482. lenSuffix := len(suffix)
  483. subSlice := s[:lenS-lenSuffix]
  484. m := measure(subSlice)
  485. if 1 < m {
  486. result = subSlice
  487. }
  488. } else if suffix := []rune("ism"); hasSuffix(s, suffix) {
  489. lenSuffix := len(suffix)
  490. subSlice := s[:lenS-lenSuffix]
  491. m := measure(subSlice)
  492. if 1 < m {
  493. result = subSlice
  494. }
  495. } else if suffix := []rune("ate"); hasSuffix(s, suffix) {
  496. lenSuffix := len(suffix)
  497. subSlice := s[:lenS-lenSuffix]
  498. m := measure(subSlice)
  499. if 1 < m {
  500. result = subSlice
  501. }
  502. } else if suffix := []rune("iti"); hasSuffix(s, suffix) {
  503. lenSuffix := len(suffix)
  504. subSlice := s[:lenS-lenSuffix]
  505. m := measure(subSlice)
  506. if 1 < m {
  507. result = subSlice
  508. }
  509. } else if suffix := []rune("ous"); hasSuffix(s, suffix) {
  510. lenSuffix := len(suffix)
  511. subSlice := s[:lenS-lenSuffix]
  512. m := measure(subSlice)
  513. if 1 < m {
  514. result = subSlice
  515. }
  516. } else if suffix := []rune("ive"); hasSuffix(s, suffix) {
  517. lenSuffix := len(suffix)
  518. subSlice := s[:lenS-lenSuffix]
  519. m := measure(subSlice)
  520. if 1 < m {
  521. result = subSlice
  522. }
  523. } else if suffix := []rune("ize"); hasSuffix(s, suffix) {
  524. lenSuffix := len(suffix)
  525. subSlice := s[:lenS-lenSuffix]
  526. m := measure(subSlice)
  527. if 1 < m {
  528. result = subSlice
  529. }
  530. }
  531. // Return.
  532. return result
  533. }
  534. func step5a(s []rune) []rune {
  535. // Initialize.
  536. lenS := len(s)
  537. result := s
  538. // Do it!
  539. if 'e' == s[lenS-1] {
  540. lenSuffix := 1
  541. subSlice := s[:lenS-lenSuffix]
  542. m := measure(subSlice)
  543. if 1 < m {
  544. result = subSlice
  545. } else if 1 == m {
  546. if c := subSlice[len(subSlice)-1]; !(hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c) {
  547. result = subSlice
  548. }
  549. }
  550. }
  551. // Return.
  552. return result
  553. }
  554. func step5b(s []rune) []rune {
  555. // Initialize.
  556. lenS := len(s)
  557. result := s
  558. // Do it!
  559. if 2 < lenS && 'l' == s[lenS-2] && 'l' == s[lenS-1] {
  560. lenSuffix := 1
  561. subSlice := s[:lenS-lenSuffix]
  562. m := measure(subSlice)
  563. if 1 < m {
  564. result = subSlice
  565. }
  566. }
  567. // Return.
  568. return result
  569. }
  570. func StemString(s string) string {
  571. // Convert string to []rune
  572. runeArr := []rune(s)
  573. // Stem.
  574. runeArr = Stem(runeArr)
  575. // Convert []rune to string
  576. str := string(runeArr)
  577. // Return.
  578. return str
  579. }
  580. func Stem(s []rune) []rune {
  581. // Initialize.
  582. lenS := len(s)
  583. // Short circuit.
  584. if 0 == lenS {
  585. /////////// RETURN
  586. return s
  587. }
  588. // Make all runes lowercase.
  589. for i := 0; i < lenS; i++ {
  590. s[i] = unicode.ToLower(s[i])
  591. }
  592. // Stem
  593. result := StemWithoutLowerCasing(s)
  594. // Return.
  595. return result
  596. }
  597. func StemWithoutLowerCasing(s []rune) []rune {
  598. // Initialize.
  599. lenS := len(s)
  600. // Words that are of length 2 or less is already stemmed.
  601. // Don't do anything.
  602. if 2 >= lenS {
  603. /////////// RETURN
  604. return s
  605. }
  606. // Stem
  607. s = step1a(s)
  608. s = step1b(s)
  609. s = step1c(s)
  610. s = step2(s)
  611. s = step3(s)
  612. s = step4(s)
  613. s = step5a(s)
  614. s = step5b(s)
  615. // Return.
  616. return s
  617. }
上海开阖软件有限公司 沪ICP备12045867号-1