From 51cb048d8d28c35b35c72fd12ac96b930e47feb2 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Sat, 8 Nov 2025 23:54:13 +0500 Subject: [PATCH 1/3] implementing fix --- .../nlp/annotators/common/TokenizedWithSentence.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/TokenizedWithSentence.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/TokenizedWithSentence.scala index 163dd884a98642..787bd7d6fdeede 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/TokenizedWithSentence.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/TokenizedWithSentence.scala @@ -26,7 +26,6 @@ object TokenizedWithSentence extends Annotated[TokenizedSentence] { val tokens = annotations .filter(_.annotatorType == annotatorType) .toArray - val sentences = SentenceSplit.unpack(annotations) /** // Evaluate whether to enable this validation to check proper usage of DOCUMENT and @@ -37,7 +36,11 @@ object TokenizedWithSentence extends Annotated[TokenizedSentence] { sentences .map(sentence => { val sentenceTokens = tokens - .filter(token => token.begin >= sentence.start & token.end <= sentence.end) + .filter(token => + token.begin >= sentence.start && + token.end <= sentence.end && + token.metadata.getOrElse("sentence", "0").toInt == sentence.index + ) .map(token => IndexedToken(token.result, token.begin, token.end)) sentenceTokens }) From 92eef0c09674facf0a46158cc611ebbe03ba16e8 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Mon, 10 Nov 2025 18:49:43 +0500 Subject: [PATCH 2/3] Introducing new test case --- ...46c9-904f-81967baf0b76-c000.snappy.parquet | Bin 0 -> 2089 bytes ...46c9-904f-81967baf0b76-c000.snappy.parquet | Bin 0 -> 10870 bytes .../embeddings/WordEmbeddingsTestSpec.scala | 30 ++++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 src/test/resources/word-embedding/test-repeated-tokens/part-00000-13a8c543-e8bc-46c9-904f-81967baf0b76-c000.snappy.parquet create mode 100644 src/test/resources/word-embedding/test-repeated-tokens/part-00001-13a8c543-e8bc-46c9-904f-81967baf0b76-c000.snappy.parquet diff --git a/src/test/resources/word-embedding/test-repeated-tokens/part-00000-13a8c543-e8bc-46c9-904f-81967baf0b76-c000.snappy.parquet b/src/test/resources/word-embedding/test-repeated-tokens/part-00000-13a8c543-e8bc-46c9-904f-81967baf0b76-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8dec94cecca3792e5fb7d7c9c3c70863c945c216 GIT binary patch literal 2089 zcmeHJO>5gg5M2~Ib%T3USwjP6!O(y!ENnN9>812iiYcM=Rzg^5$FA^dDPUEYhtMNDS)=E9>xFd#a{_Sy zLPTn2_EqjS5My0?-bdXgb5Q$S|Cc78lUB3Ur8Lnqk1sfK>=kr-x1ddwC|d7(L(eDl zY{$dgNDq~9lY;eh^37XQM8BWhF^fF2F*8=Dp=Dh*jX(;+#$WE3;&EYGNeGRla}~t3 z=1V+Yi71y08&}qQAh%Rj;pRmk_}lucwss~P^991poDl*HiGD1JB?%-FkZm%xww(t3U&u_mB(nyN72nPPB@AodmFgWiIARJ!w jdt!9n3k3)NwGV^dNQi)+wSM4t`M`&qHVC=IKMB785#@g5 literal 0 HcmV?d00001 diff --git a/src/test/resources/word-embedding/test-repeated-tokens/part-00001-13a8c543-e8bc-46c9-904f-81967baf0b76-c000.snappy.parquet b/src/test/resources/word-embedding/test-repeated-tokens/part-00001-13a8c543-e8bc-46c9-904f-81967baf0b76-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1fa1fa37faffb501ff5e4e9c5bc589d1add99463 GIT binary patch literal 10870 zcmeI2eRva9zQ^aJnL3140(4>m5l_=Lw%UXwP2XZcN*ZV}v^6d7uajijM$#lEDNuzK z6_HiM7f@V9SAAF)xU6sMvM4LK>h8MA`nc-lde`-3y}qp%U)GDPcdxSddnTDCm4bNv z?>_SIX-_6+&Tr1!_nhyUb~Xo`qE( z70bnaSKah^ky$K6{z9>kp-f;bASHG*2JmL+q^L>2wI1h1zKD+Hg`i~J@rN- zu}DaY($+{S9ZwE=XnQ;+)0vT#a?DP>{;(>uC36;>3HXUrMFv zY&n)nCkIeMr<{z$5x0GEv(6WlL+NB(DLOhxyCS_YIZQhS(scSDZAsc`SAQhkSFI$y z5}r3%O}*Wf&0XHks!aKpJj;FBG*gbrN%nB@R3ef_g55epiqV-;Fd31fw9(;mI;gj^ zn|kA6xiVEnn?1Clp~g|G(@mekHAP~yU0y{O#FHziHF(P%{KPgT?Q1zy58 z=_*}LR~4`gX`pScFQ990Yo6I0 zXr^R5*G2y5piXytR*AvvO{Q2{3_>WTEw54PE z(X8 z(9}RurjZ_+Sh$3HmPS%E7EjXzs*X`>*LC;FviVvjwWDGRcUG(jBThN@;7mZvd z1?6<#1+?AXqbaiMp86%ZpgP_kPoV~j&>QUsYPq9Tgi0|~CxxdA4W{G07)|L&C}l@; zwY2!!TjtEB0bgfFpqqAg&>0A?i6OAqR# zl-4h&lY@4h&W?Z82!wX&L%lLhLFJW^upA4?6!RoJh{lyt5y`Hbk(?|gC?=9G(1vLh z@Ogb05G}NhcKO>{d|lMvJl*HV1xJm`=EUg6w7_%uyxq7qG@&t-JAK}^*}it#)iEdF z^GZaa&aO6JmoDI&;R^)X+Gk=wdfWVM z-3u^f7eFDWiPOc$ANuK!$22j)H5mMBoiYrskfq_UE%66V;7x(PizfWft z$EhMKcJj!;_uZl4n23(!HGFxShQrsJHSGTbFaPG?nN4C5i*YS^2))HQ%JHMeA-RK+ zv&U9j9}d~vE@y)c3RP>9L!kzDNRmRXV3622C$Zh*sHt~2T`t)cbkrbgofNb+G&t*R z^>q!wuoMov-N8myu8Y_ zR=s%e>Yw~xP_k#2p6$ZXz2s|m6tLQ}*2kj9v4v3&N`L`O1d{*-WuOAsKrN^TO`sXH zfs4Rguo%c70S3W3a22=~Yyz9XHgF%<0d|2s;9;;I{2DwBeg_VN*TG-FJK#O=0r&`f z0{#a63;Z|uAD|%|rv;jC13(DgHmuVm<-&Y4m5%b!AyYJ#dU+lU@3@zD2Rbe zK?W z@PSrvG3WvdKoG0|{U8n2fc0PlxCz_|c7WYr5BNFw1=tS`g5QGY!5_e3@D}(0due1(bpFfg3Q1@PG?JD_|1Q4HkkB zh=En$3a}1b1Fi>5B5noug8RTv!EW#%cnmxVehq#DUIDLxH^JNBUBD#b2w)QNDPR)u z1z-|E*z5sk06kz5VFng(9;g9zfJwv!zzf>IY|stzC1NQ)_ksjSfmL7)xDu=fOd>V| zCJ|czlZb7ANyK)*B;tPX0Qfmz67d*#0{jlBQ~wpD{|w#)sziK<^ihy65vRzaH8k`Bn0|O`p zlYtA=fvJEghZoESbHEZ11W^zNtH2dtBe(@H<+u$n<+v041Tf{;4ITuK0#!MlLi*d} zeIEH=1*&rV2htyckHArI3^3&&n4#l=5m*3Ij`M&WxIsPe056yUIssFTd0;t+0Hz#C za2dE7Fy+_`egy6Vy8u&;M*vff$G{Wd8Sorn%JB+#1H2EIa{MR2)??#Y;y=cVKYQ@- z`vpznL&{#|nb8i6N3IeXIA%*padp)hNN3F{ZxmoHh>`P<;&NtDQAy4(^Cg^}6bW#u zq{f`Jssfp0%$il=Dq?Px2t~}V67y=+vFdR5&=%Ll7Y|lr}m@%Jhnv<;w)PS<<1MK1d3+ElF~PUa_>L=VME9eo4mDlHJCvsU@S0Suaa= zm{m7R5(2y|KdB^RW{F>~*jX}BfT88v^@^hm0xT_gTwCiRFF~rSD)VeDnV_xmko6@QXY1>4 zCfY<_>Ag+G=&XEiThUVlZ(Br+&vGF_+GL_+m$90a^AOe4puXx z7S8t|v&$Lt%YS56EpMr*GSBq#eP-43mR6KxTrZiS?d~A@l8o~udab>RY)z=%cPtw7 z1p7xN4!csZge(DH?62%~u2XuhcUOn!`h?6~nIyl4>4GV;Zy2@}oy*^|zpr52&A+#*zz zTQy3ib1N-E)p_Sv+cb85^0}38wlx;PU0z#WhxNKaDYMbTUr^BG;dN8^Tx#lt{Iurj z<=z(GjPkmft^8C(Y!hY)9{wUfKbvdkJ2aga2P{ID&^@Pot}t)@f`y9~FIlv7+44D3 zFeHTKp5DI5B`c!+vGTfj;?gAI3aNBna$u#vty(>J+2yGWwnO}|!&&tx0WDi8R*Tmi zczdrV-}jdI*;cfmd(M#Y<(cu-Dqefb)33ie@lnW)g#Rl=YU3+gF z)>9M0UoWvx^8(?XBxJP)_gLV+Z`ifthkX84dF-^W-=j5+)tHQedi;@>6g4IoYUOKY zP&t!P%REQ^qRBXImdRf_+RF3u)*n#ASV*%0m{jOz2Q{@? zqc)1OQc=UcKSPen%KMKZPAsa>7>lJC)+z}n``v>Hxq|bXmC`FXBOjD|BQZomx{jp*}mShdNaL|5nnVF_}$Ath3Bwk*PvkXlfnmTBSSH#u(m#qQ+2Y z*hCCliFKA$Fy}NWmojjbL$f(|l`~aCN3|FvkUjQ`Gi;W`62p57#HpNhJ491ss^Cug z6q&)IL|<+QB7ajx=rS`HAr;xne8PM}3=Ay9xK9!dh8+tGv-dPX1!S z_*KDoUF%NoT^e&EUCLXc{I_YtUW7VEd^?(Z&Bj*WJY({0-m@uhCcpNkO?h>h??+;% zde^4DdmHhE8+-pY^u|qjjdx<<|A&`wG;2t{>r1#(Y&!BM>kE=-G^|@B;*8n4)u1Y0 z-^j{0wDROu8@9?|_Kq_gAckiMim>hx&6vQ-p5>Hc_a2jpyO)S{oOLJkDI>Y3A2_p| zDWSI(7;Y~>A=Xa~Ec9t@jf})Gf&hQXp6Qh(VjpMSIKeCmigcainex`pVatP}Wqv~^ zN@%LXb}J{nw**o8ua=6F(9{#v8nkK(vukno7nC?t zM)vav_1eiI|7oe{;;bJda&}=EA@b0wf`~`-hG+H257}qC!Q8@<`;=e113`?WP0QHA zvbXwFF|w;lCc;rYaw7uUSmq++KIKO^%pyFlM0oMyQ?*}iRY8#LX2b0!>x&m#cTZuX zgKbxhq-=yn`j{2}!7|axS=Y}~i|-qukxo7h5bic&xSOB=>r->F8Blk?xfFV9gsUSw z&V~^BZ1Zxq8Q3}hTRzUv%B!-6wim<*lkJX>JMjraxS-3;XMGI%WX~EFYAEEZA z825-4FwDjMO?Aq4erVJ22rK;KFUD9Xs2fTRcPGo}$QMTl%4L*+XD^ zjnA=xXjBSDQ5qgguC}O8jMCSxtsZ(ZW&fYMJ%ZQ#y>fEc{XJ4Nb@BqRU;8e%XPic^ zgm3%&L)AK=W&5QBip5%yY13TGD5hk@i{5xFjl<1UJK8SOZfj4wA#>Jds4_#{IASMT zaD*nY;Sw4aRcY;S3cCauhTq0I`P6q=hL0GJMLo%|Km>oG77n_d4K+?jW4Jb0U*ncT qwT(57a6`Q#6qaOXol|x@8p7eARAoAXLy*(_&s~RO@J{^2>Hh-wK|Bor literal 0 HcmV?d00001 diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTestSpec.scala index 4b0c27a3751f39..f872b30531700b 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTestSpec.scala @@ -23,6 +23,7 @@ import com.johnsnowlabs.nlp.{Annotation, AssertAnnotations} import com.johnsnowlabs.tags.{FastTest, SlowTest} import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.col import org.scalatest.flatspec.AnyFlatSpec class WordEmbeddingsTestSpec extends AnyFlatSpec with SparkSessionTest { @@ -31,6 +32,35 @@ class WordEmbeddingsTestSpec extends AnyFlatSpec with SparkSessionTest { .option("header", "true") .csv("src/test/resources/embeddings/clinical_words.txt") + + "Word Embeddings" should "Should not repeat tokens" taggedAs FastTest in { + + val loaded = spark.read.parquet("src/test/resources/word-embedding/test-repeated-tokens") + + val embeddings = WordEmbeddingsModel + .pretrained("glove_100d", "en") + .setInputCols(Array("splitter", "token")) + .setOutputCol("embedding") + + val pipeline = new Pipeline() + .setStages(Array( embeddings)) + + val model = pipeline.fit(loaded) + + val result = model.transform(loaded) + val duplicateBegins = result + .selectExpr("explode(embedding) as e") + .select(col("e.begin").alias("begin")) + .groupBy("begin") + .count() + .filter(col("count") > 2) + .count() + + assert(duplicateBegins == 0, s"Found $duplicateBegins repeated tokens (duplicate begin positions)") + + } + + "Word Embeddings" should "correctly embed clinical words not embed non-existent words" taggedAs SlowTest in { val notWords = spark.read From 38b8b13bca140f4ff0ee857ead7f60610cf86c8b Mon Sep 17 00:00:00 2001 From: Devin Ha Date: Thu, 13 Nov 2025 12:25:40 +0100 Subject: [PATCH 3/3] scalafmt --- .../nlp/annotators/common/TokenizedWithSentence.scala | 3 +-- .../nlp/embeddings/WordEmbeddingsTestSpec.scala | 11 +++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/TokenizedWithSentence.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/TokenizedWithSentence.scala index 787bd7d6fdeede..06f66f763d8cd4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/TokenizedWithSentence.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/TokenizedWithSentence.scala @@ -39,8 +39,7 @@ object TokenizedWithSentence extends Annotated[TokenizedSentence] { .filter(token => token.begin >= sentence.start && token.end <= sentence.end && - token.metadata.getOrElse("sentence", "0").toInt == sentence.index - ) + token.metadata.getOrElse("sentence", "0").toInt == sentence.index) .map(token => IndexedToken(token.result, token.begin, token.end)) sentenceTokens }) diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTestSpec.scala index f872b30531700b..fbe863f67307a1 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTestSpec.scala @@ -32,18 +32,17 @@ class WordEmbeddingsTestSpec extends AnyFlatSpec with SparkSessionTest { .option("header", "true") .csv("src/test/resources/embeddings/clinical_words.txt") - "Word Embeddings" should "Should not repeat tokens" taggedAs FastTest in { val loaded = spark.read.parquet("src/test/resources/word-embedding/test-repeated-tokens") - val embeddings = WordEmbeddingsModel + val embeddings = WordEmbeddingsModel .pretrained("glove_100d", "en") .setInputCols(Array("splitter", "token")) .setOutputCol("embedding") val pipeline = new Pipeline() - .setStages(Array( embeddings)) + .setStages(Array(embeddings)) val model = pipeline.fit(loaded) @@ -56,11 +55,11 @@ class WordEmbeddingsTestSpec extends AnyFlatSpec with SparkSessionTest { .filter(col("count") > 2) .count() - assert(duplicateBegins == 0, s"Found $duplicateBegins repeated tokens (duplicate begin positions)") - + assert( + duplicateBegins == 0, + s"Found $duplicateBegins repeated tokens (duplicate begin positions)") } - "Word Embeddings" should "correctly embed clinical words not embed non-existent words" taggedAs SlowTest in { val notWords = spark.read