Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions mmda/predictors/heuristic_predictors/dictionary_word_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
"""

import string
from typing import Optional, Set, List
from typing import Any, Dict, Optional, Set, List

from mmda.predictors.base_predictors.base_predictor import BasePredictor
from mmda.types.annotation import Annotation, Span, SpanGroup
from mmda.types.document import Document
from mmda.types.names import Rows, Tokens

from ftfy import fix_text, TextFixerConfig


class DictionaryWordPredictor(BasePredictor):

Expand All @@ -20,7 +22,11 @@ class DictionaryWordPredictor(BasePredictor):

_dictionary: Optional[Set[str]] = None

def __init__(self, dictionary_file_path: str) -> None:
def __init__(
self,
dictionary_file_path: str,
ftfy_config: Optional[Dict[str, Any]] = None
) -> None:
"""Build a predictor that indexes the given dictionary file.
A dictionary is simply a case-sensitive list of words as a text file.
Words should be lower-case in the dictionary unless they are invalid
Expand All @@ -41,6 +47,9 @@ def __init__(self, dictionary_file_path: str) -> None:
"""
self.dictionary_file_path = dictionary_file_path

ftfy_config = ftfy_config or {"explain": False}
self.ftfy_config = TextFixerConfig(**ftfy_config)

@property
def dictionary(self) -> Set[str]:
"""Global dictionary and not document specific. This dictionary is the basis for
Expand Down Expand Up @@ -171,7 +180,9 @@ def predict(self, document: Document) -> List[SpanGroup]:
return words

def _token_text(self, token: SpanGroup) -> str:
    """Return the text of ``token`` after ftfy normalization.

    The token's symbols are concatenated and the result is passed through
    ``ftfy.fix_text`` using the ``TextFixerConfig`` stored on
    ``self.ftfy_config`` by ``__init__``.

    Args:
        token: The token whose ``symbols`` are joined into a single string.

    Returns:
        The joined symbols, as fixed by ftfy.
    """
    raw_text = "".join(token.symbols)
    # Do not return the raw join directly: the ftfy pass below is the whole
    # point of this method and must run on every token.
    return fix_text(raw_text, config=self.ftfy_config)

def _copy_token_with_text(self, token: SpanGroup) -> SpanGroup:
    """Build a new ``SpanGroup`` with the spans of ``token`` and its text.

    The text is whatever ``self._token_text`` returns for ``token``.
    """
    token_spans = token.spans
    token_text = self._token_text(token)
    return SpanGroup(spans=token_spans, text=token_text)
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
setup(
name="mmda",
description="mmda",
version="0.0.42",
version="0.0.43",
url="https://www.github.com/allenai/mmda",
python_requires=">= 3.7",
packages=find_namespace_packages(include=["mmda*", "ai2_internal*"]),
Expand All @@ -15,7 +15,8 @@
"pandas",
"pydantic",
"ncls",
"necessary"
"necessary",
"ftfy>=6.1.0,<7.0.0",
],
extras_require={
"dev": ["pytest"],
Expand Down
51 changes: 50 additions & 1 deletion tests/test_predictors/test_dictionary_word_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def test_optional_plurarl_words_combined(self):

def test_next_row_single_token(self):
# fmt:off
#0 10
#0 10
#012345678901
text = "Many lin-es"
# fmt:on
Expand Down Expand Up @@ -214,3 +214,52 @@ def test_next_row_single_token(self):
"Many lin-es",
" ".join([w.text for w in words]),
)


class TestFtfyFixes(unittest.TestCase):
    """Tests for the ftfy text fixing applied by ``DictionaryWordPredictor``."""

    def test_ftfy(self):
        """A ligature token and a hyphenated row break both come out as plain words."""
        # "Veri\ufb01cation" is deliberately spelled with the one-character
        # ligature U+FB01 (LATIN SMALL LIGATURE FI). Every span offset below
        # assumes that single character; spelling it as the two letters "fi"
        # shifts all later spans by one and leaves ftfy with nothing to fix.
        # The escape is used so that editors or copy/paste cannot silently
        # normalize the ligature away.
        text = (
            "Fact Veri\ufb01cation, which reasons whether "
            "a claim is supported / refuted by mult-"
            "iple evidences"
        )
        # Guard the offsets: the final span ends at 93, i.e. at the end of text.
        self.assertEqual(93, len(text))

        spans = [
            Span(start=0, end=4),    # Fact
            Span(start=5, end=17),   # Veri(fi-ligature)cation,
            Span(start=18, end=23),  # which
            Span(start=24, end=31),  # reasons
            Span(start=32, end=39),  # whether
            Span(start=40, end=41),  # a
            Span(start=42, end=47),  # claim
            Span(start=48, end=50),  # is
            Span(start=51, end=60),  # supported
            Span(start=61, end=62),  # /
            Span(start=63, end=70),  # refuted
            Span(start=71, end=73),  # by
            Span(start=74, end=79),  # mult-
            Span(start=79, end=83),  # iple
            Span(start=84, end=93),  # evidences
        ]
        # Three rows; the second one ends on the hyphenated fragment "mult-".
        rows = [
            SpanGroup(id=1, spans=spans[:5]),
            SpanGroup(id=2, spans=spans[5:13]),
            SpanGroup(id=3, spans=spans[13:]),
        ]

        doc = mock_document(symbols=text, spans=spans, rows=rows)

        with tempfile.NamedTemporaryFile() as f:
            # The dictionary only needs the word split across the row break.
            f.write("multiple\n".encode("utf-8"))
            f.flush()

            predictor = DictionaryWordPredictor(
                dictionary_file_path=f.name
            )
            words: List[SpanGroup] = predictor.predict(doc)

        parsed = " ".join([str(w.text) for w in words])
        # The expected output spells "Verification" with plain letters and
        # joins "mult-" + "iple" into "multiple".
        reference = (
            "Fact Verification, which reasons whether a claim is "
            "supported / refuted by multiple evidences"
        )
        self.assertEqual(reference, parsed)
22 changes: 15 additions & 7 deletions tests/test_predictors/test_vila_predictors.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import json
from pathlib import Path

from PIL import Image

Expand Down Expand Up @@ -52,6 +53,8 @@

S2VL_LABEL_MAP = {int(key): val for key, val in S2VL_LABEL_MAP.items()}

FIXTURES_PATH = Path(__file__).parent.parent / "fixtures"


def test_vila_predictors():
layout_predictor = LayoutParserPredictor.from_pretrained(
Expand All @@ -61,8 +64,13 @@ def test_vila_predictors():
pdfplumber_parser = PDFPlumberParser()
rasterizer = PDF2ImageRasterizer()

doc = pdfplumber_parser.parse(input_pdf_path="tests/fixtures/1903.10676.pdf")
images = rasterizer.rasterize(input_pdf_path="tests/fixtures/1903.10676.pdf", dpi=72)
doc = pdfplumber_parser.parse(
input_pdf_path=str(FIXTURES_PATH /"1903.10676.pdf")
)
images = rasterizer.rasterize(
input_pdf_path=str(FIXTURES_PATH /"1903.10676.pdf"),
dpi=72
)
doc.annotate_images(images)

layout_regions = layout_predictor.predict(doc)
Expand Down Expand Up @@ -123,9 +131,9 @@ def test_vila_predictors():
assert [ele.type for ele in resA] == [S2VL_LABEL_MAP[ele.type] for ele in resB]

def test_vila_predictors_with_special_unicode_inputs():
test_doc_path = "tests/fixtures/unicode-test.json"

test_doc_path = FIXTURES_PATH / "unicode-test.json"

with open(test_doc_path, 'r') as fp:
res = json.load(fp)

Expand All @@ -136,4 +144,4 @@ def test_vila_predictors_with_special_unicode_inputs():
"allenai/ivila-row-layoutlm-finetuned-s2vl-v2"
)

ivilaA.predict(doc, subpage_per_run=2)
ivilaA.predict(doc, subpage_per_run=2)