|
9 | 9 | */ |
10 | 10 | class SentenceTokenizer extends TokenizerAbstract |
11 | 11 | { |
/**
 * Delimiter on which tokenize() cuts the input text into chunks.
 *
 * @var string
 */
protected $separator;

/**
 * Create a tokenizer that uses the given chunk delimiter.
 *
 * @param string $separator delimiter between chunks; defaults to two
 *                          consecutive newlines
 */
public function __construct($separator = "\n\n")
{
    $this->separator = $separator;
}
| 22 | + |
/**
 * Get the delimiter this tokenizer is configured with.
 *
 * @return string the current value of the $separator property
 */
public function getSeparator()
{
    return $this->separator;
}
| 31 | + |
| 32 | + |
/**
 * Split a text into sentences.
 *
 * The text is first cut into chunks on the configured separator. Chunks
 * that contain only whitespace are skipped; every other chunk is handed
 * to tokenizeSentence(). The sentences of all chunks are returned as one
 * flat list, in their original order.
 *
 * @param string $string text to tokenize
 * @return array flat list of sentences
 */
public function tokenize($string)
{
    $separator = (string) $this->getSeparator();

    // explode() rejects an empty delimiter (warning + false before PHP 8,
    // ValueError since PHP 8), so without a usable separator the whole
    // text is treated as a single chunk.
    $chunks = $separator === '' ? [$string] : explode($separator, $string);

    $sentenceTokens = [];
    foreach ($chunks as $chunk) {
        // Compare against '' instead of calling empty(): empty() also
        // reports the string "0" as empty, and empty() on an expression
        // is a parse error before PHP 5.5.
        if (trim($chunk) === '') {
            continue;
        }

        foreach ($this->tokenizeSentence($chunk) as $sentence) {
            $sentenceTokens[] = $sentence;
        }
    }

    return $sentenceTokens;
}
| 55 | + |
| 56 | + |
| 57 | + protected function tokenizeSentence($string) |
| 58 | + { |
| 59 | + |
| 60 | + |
14 | 61 | $before_regexes = array('/(?:(?:[\'\"„][\.!?…][\'\"”]\s)|(?:[^\.]\s[A-Z]\.\s)|(?:\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\.\s)|(?:\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\.\s[A-Z]\.\s)|(?:\bApr\.\s)|(?:\bAug\.\s)|(?:\bBros\.\s)|(?:\bCo\.\s)|(?:\bCorp\.\s)|(?:\bDec\.\s)|(?:\bDist\.\s)|(?:\bFeb\.\s)|(?:\bInc\.\s)|(?:\bJan\.\s)|(?:\bJul\.\s)|(?:\bJun\.\s)|(?:\bMar\.\s)|(?:\bNov\.\s)|(?:\bOct\.\s)|(?:\bPh\.?D\.\s)|(?:\bSept?\.\s)|(?:\b\p{Lu}\.\p{Lu}\.\s)|(?:\b\p{Lu}\.\s\p{Lu}\.\s)|(?:\bcf\.\s)|(?:\be\.g\.\s)|(?:\besp\.\s)|(?:\bet\b\s\bal\.\s)|(?:\bvs\.\s)|(?:\p{Ps}[!?]+\p{Pe} ))\Z/su', |
15 | 62 | '/(?:(?:[\.\s]\p{L}{1,2}\.\s))\Z/su', |
16 | 63 | '/(?:(?:[\[\(]*\.\.\.[\]\)]* ))\Z/su', |
@@ -75,8 +122,6 @@ public function tokenize($string) |
75 | 122 | // perform some cleanup, and re-index the array |
76 | 123 | return array_values(array_filter(array_map('trim',$sentences))); |
77 | 124 | } |
78 | | - |
79 | | - |
80 | | - |
| 125 | + |
81 | 126 | } |
82 | 127 |
|
0 commit comments