Feat/chipper v3 (#308)

ajjimeno · web-flow · commit 4a4b64aaa3e8 · 2023-12-20T16:23:04.000+11:00
This new version of Chipper should substantially improve the output for tables. In the attached file, the output appeared to have many cells spread across multiple columns, largely because of the $ character, which was inconsistently annotated in the Odetta set. In addition, colspan did not work properly for the header. This new version of Chipper does not predict thead and tbody tokens for tables. To test it, run the code below, which prints the predicted elements. The code should print only one page and one element. The element has a field named text_as_html. The HTML within that field can be pasted into a new file saved with an .html extension and opened in a browser. Example with Chipperv2 <img width="1146" alt="image" src="https://github.com/Unstructured-IO/unstructured-inference/assets/3939469/feffe674-8c9b-4c64-bd6d-08bd602c596a"> Example with Chipperv3 <img width="666" alt="image" src="https://github.com/Unstructured-IO/unstructured-inference/assets/3939469/f06867a9-2636-4055-a158-42badc58dd09"> <img width="677" alt="apple" src="https://github.com/Unstructured-IO/unstructured-inference/assets/3939469/d7ec628e-0dca-409c-894a-612350fce71f"> ``` from unstructured_inference.inference.layout import DocumentLayout from unstructured_inference.models.base import get_model model = get_model("chipper") doc = DocumentLayout.from_image_file("[point to the location of the file]/apple.png", detection_model=model) for i in range(len(doc.pages)): print(f"********** Page {i}") print(*[element.__dict__ for element in doc.pages[i].elements], sep="\n") ``` --------- Co-authored-by: Antonio Jimeno Yepes <antonio@unstructured.io>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.7.20
+
+* chipper-v3: improved table prediction
+
 ## 0.7.19
 
 * refactor: remove all OCR related code
diff --git a/test_unstructured_inference/models/test_chippermodel.py b/test_unstructured_inference/models/test_chippermodel.py
@@ -139,13 +139,8 @@ def test_no_repeat_ngram_logits():
 
     no_repeat_ngram_size = 2
 
-    output = chipper._no_repeat_ngram_logits(
-        input_ids=input_ids,
-        cur_len=cur_len,
-        logits=logits,
-        batch_size=batch_size,
-        no_repeat_ngram_size=no_repeat_ngram_size,
-    )
+    logitsProcessor = chipper.NoRepeatNGramLogitsProcessor(ngram_size=2)
+    output = logitsProcessor(input_ids=input_ids, scores=logits)
 
     assert (
         int(
@@ -194,6 +189,25 @@ def test_no_repeat_ngram_logits():
     )
 
 
+def test_ngram_repetiton_stopping_criteria():
+    input_ids = torch.tensor([[1, 2, 3, 4, 0, 1, 2, 3, 4]])
+    logits = torch.tensor([[0.1, -0.3, -0.5, 0, 1.0, -0.9]])
+
+    stoppingCriteria = chipper.NGramRepetitonStoppingCriteria(
+        repetition_window=2, skip_tokens={0, 1, 2, 3, 4}
+    )
+
+    output = stoppingCriteria(input_ids=input_ids, scores=logits)
+
+    assert output is False
+
+    stoppingCriteria = chipper.NGramRepetitonStoppingCriteria(
+        repetition_window=2, skip_tokens={1, 2, 3, 4}
+    )
+    output = stoppingCriteria(input_ids=input_ids, scores=logits)
+    assert output is True
+
+
 @pytest.mark.parametrize(
     ("decoded_str", "expected_classes"),
     [
@@ -241,7 +255,51 @@ def test_postprocess_bbox(decoded_str, expected_classes):
         assert out[i].type == expected_classes[i]
 
 
-def test_run_chipper_v2():
+def test_predict_tokens_beam_indices():
+    model = get_model("chipper")
+    model.stopping_criteria = [
+        chipper.NGramRepetitonStoppingCriteria(
+            repetition_window=1,
+            skip_tokens={},
+        ),
+    ]
+    img = Image.open("sample-docs/easy_table.jpg")
+    output = model.predict_tokens(image=img)
+    assert len(output) > 0
+
+
+def test_largest_margin_edge():
+    model = get_model("chipper")
+    img = Image.open("sample-docs/easy_table.jpg")
+    output = model.largest_margin(image=img, input_bbox=[0, 1, 0, 0], transpose=False)
+
+    assert output is None
+
+    output = model.largest_margin(img, [1, 1, 1, 1], False)
+
+    assert output is None
+
+    output = model.largest_margin(img, [2, 1, 3, 10], True)
+
+    assert output == (0, 0, 0)
+
+
+def test_deduplicate_detected_elements():
+    model = get_model("chipper")
+    img = Image.open("sample-docs/easy_table.jpg")
+    elements = model(img)
+
+    output = model.deduplicate_detected_elements(elements)
+
+    assert len(output) == 2
+
+
+def test_norepeatnGramlogitsprocessor_exception():
+    with pytest.raises(ValueError):
+        chipper.NoRepeatNGramLogitsProcessor(ngram_size="")
+
+
+def test_run_chipper_v3():
     model = get_model("chipper")
     img = Image.open("sample-docs/easy_table.jpg")
     elements = model(img)
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.19"  # pragma: no cover
+__version__ = "0.7.20"  # pragma: no cover
diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py
@@ -13,10 +13,19 @@ class Source(Enum):
     CHIPPER = "chipper"
     CHIPPERV1 = "chipperv1"
     CHIPPERV2 = "chipperv2"
+    CHIPPERV3 = "chipperv3"
     MERGED = "merged"
     SUPER_GRADIENTS = "super-gradients"
 
 
+CHIPPER_VERSIONS = (
+    Source.CHIPPER,
+    Source.CHIPPERV1,
+    Source.CHIPPERV2,
+    Source.CHIPPERV3,
+)
+
+
 class ElementType:
     IMAGE = "Image"
     FIGURE = "Figure"
@@ -37,3 +46,6 @@ class ElementType:
 
 
 FULL_PAGE_REGION_THRESHOLD = 0.99
+
+# this field is defined by pytesseract/unstructured.pytesseract
+TESSERACT_TEXT_HEIGHT = "height"
diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
@@ -10,6 +10,7 @@
 
 from unstructured_inference.config import inference_config
 from unstructured_inference.constants import (
+    CHIPPER_VERSIONS,
     FULL_PAGE_REGION_THRESHOLD,
     ElementType,
     Source,
@@ -108,7 +109,7 @@ def merge_inferred_layout_with_extracted_layout(
                 continue
         region_matched = False
         for inferred_region in inferred_layout:
-            if inferred_region.source in (Source.CHIPPER, Source.CHIPPERV1):
+            if inferred_region.source in CHIPPER_VERSIONS:
                 continue
 
             if inferred_region.bbox.intersects(extracted_region.bbox):
diff --git a/unstructured_inference/models/chipper.py b/unstructured_inference/models/chipper.py
@@ -15,7 +15,7 @@
 from transformers.generation.logits_process import LogitsProcessor
 from transformers.generation.stopping_criteria import StoppingCriteria
 
-from unstructured_inference.constants import Source
+from unstructured_inference.constants import CHIPPER_VERSIONS, Source
 from unstructured_inference.inference.elements import Rectangle
 from unstructured_inference.inference.layoutelement import LayoutElement
 from unstructured_inference.logger import logger
@@ -44,11 +44,22 @@
         "max_length": 1536,
         "heatmap_h": 40,
         "heatmap_w": 30,
+        "source": Source.CHIPPERV2,
+    },
+    "chipperv3": {
+        "pre_trained_model_repo": "unstructuredio/chipper-v3",
+        "swap_head": True,
+        "swap_head_hidden_layer_size": 128,
+        "start_token_prefix": "<s_",
+        "prompt": "<s><s_hierarchical>",
+        "max_length": 1536,
+        "heatmap_h": 40,
+        "heatmap_w": 30,
         "source": Source.CHIPPER,
     },
 }
 
-MODEL_TYPES["chipper"] = MODEL_TYPES["chipperv2"]
+MODEL_TYPES["chipper"] = MODEL_TYPES["chipperv3"]
 
 
 class UnstructuredChipperModel(UnstructuredElementExtractionModel):
@@ -390,7 +401,7 @@ def deduplicate_detected_elements(
         min_text_size: int = 15,
     ) -> List[LayoutElement]:
         """For chipper, remove elements from other sources."""
-        return [el for el in elements if el.source in (Source.CHIPPER, Source.CHIPPERV1)]
+        return [el for el in elements if el.source in CHIPPER_VERSIONS]
 
     def adjust_bbox(self, bbox, x_offset, y_offset, ratio, target_size):
         """Translate bbox by (x_offset, y_offset) and shrink by ratio."""
@@ -516,12 +527,13 @@ def reduce_element_bbox(
         Given a LayoutElement element, reduce the size of the bounding box,
         depending on existing elements
         """
-        bbox = [element.bbox.x1, element.bbox.y1, element.bbox.x2, element.bbox.y2]
+        if element.bbox:
+            bbox = [element.bbox.x1, element.bbox.y1, element.bbox.x2, element.bbox.y2]
 
-        if not self.element_overlap(elements, element):
-            element.bbox = Rectangle(*self.reduce_bbox_no_overlap(image, bbox))
-        else:
-            element.bbox = Rectangle(*self.reduce_bbox_overlap(image, bbox))
+            if not self.element_overlap(elements, element):
+                element.bbox = Rectangle(*self.reduce_bbox_no_overlap(image, bbox))
+            else:
+                element.bbox = Rectangle(*self.reduce_bbox_overlap(image, bbox))
 
     def bbox_overlap(
         self,

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.7.19" # pragma: no cover`
	`1`	`+__version__ = "0.7.20" # pragma: no cover`