google-research
diff --git a/‎config/default.py‎
Lines changed: 1 addition & 0 deletions b/‎config/default.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎core/data/codenet_paths.py‎
Lines changed: 2 additions & 1 deletion b/‎core/data/codenet_paths.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎core/data/test_process.py‎
Lines changed: 136 additions & 0 deletions b/‎core/data/test_process.py‎
Lines changed: 136 additions & 0 deletions
diff --git a/‎core/lib/metrics.py‎
Lines changed: 2 additions & 0 deletions b/‎core/lib/metrics.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎core/lib/test_metrics.py‎
Lines changed: 21 additions & 0 deletions b/‎core/lib/test_metrics.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎core/models/ipagnn.py‎
Lines changed: 0 additions & 6 deletions b/‎core/models/ipagnn.py‎
Lines changed: 0 additions & 6 deletions
diff --git a/‎core/modules/ipagnn/ipagnn.py‎
Lines changed: 25 additions & 4 deletions b/‎core/modules/ipagnn/ipagnn.py‎
Lines changed: 25 additions & 4 deletions
@@ -63,6 +63,7 @@ def default_config():
   config.eval_metric_names: Tuple[str] = metrics.all_metric_names()
   config.eval_subsample = 1.0
   config.eval_max_batches = 30
+  config.unsupervised_localization: bool = True  # Localization logits are only computed when this and raise_in_ipagnn are both True.
 
   # Logging
   config.printoptions_threshold = 256
 
@@ -8,12 +8,13 @@
 DEFAULT_DATASET_PATH = 'datasets/codenet/2021-11-01-f=0.01'
 TEST_DATASET_PATH = 'datasets/codenet/2021-11-01-f=0.01'
 DEFAULT_TOKENIZER_PATH = 'out/tokenizers/train-1000000.json'
+DOCSTRING_TOKENIZER_PATH = 'out/tokenizers/train-docstrings-1000000.json'
 DEFAULT_SPLITS_PATH = 'out/splits/default.json'
 DEFAULT_EXPERIMENTS_DIR = 'out/experiments'
 EXPERIMENT_ID_PATH = 'out/experiment_id.txt'
 
 FULL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-10-07-full'
-FULL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-11-01'
+FULL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-11-17'
 # Raw control_flow_programs data pattern:
 DEFAULT_CFP_DATA_PATTERN = '/mnt/runtime-error-problems-experiments/datasets/control_flow_programs/decimal-large-state-L10/0.0.48/control_flow_programs-train.tfrecord-*'
 # Processed control_flow_programs dataset path:
 
@@ -314,6 +314,142 @@ def test_make_runtimeerrorproblem_try_finally_in_try_except(self):
     # a finally inside the try.
     # Can only get into "raising" territory via a finally block's true branch or via a raise edge.
 
+  def test_get_nodes_at_lineno_no_error(self):
+    lineno = 0
+    target = '1'
+    source = """x = 1
+while x < 2:
+  y = 3
+  while y < 4:
+    y += 5
+  x += 6
+"""
+    raw = process.make_rawruntimeerrorproblem(
+        source, target, lineno)
+    nodes = process.get_nodes_at_lineno(raw, lineno)
+    self.assertEqual(nodes, [])
+
+  def test_get_nodes_at_lineno_1(self):
+    lineno = 1  # x = 1
+    target = '1'
+    source = """x = 1
+while x < 2:
+  y = 3
+  while y < 4:
+    y += 5
+  x += 6
+"""
+    raw = process.make_rawruntimeerrorproblem(
+        source, target, lineno)
+    nodes = process.get_nodes_at_lineno(raw, lineno)
+    self.assertEqual(nodes, [0])
+
+  def test_get_nodes_at_lineno_2(self):
+    lineno = 2  # while x < 2:
+    target = '1'
+    source = """x = 1
+while x < 2:
+  y = 3
+  while y < 4:
+    y += 5
+  x += 6
+"""
+    raw = process.make_rawruntimeerrorproblem(
+        source, target, lineno)
+    nodes = process.get_nodes_at_lineno(raw, lineno)
+    self.assertEqual(nodes, [1])
+
+  def test_get_nodes_at_lineno_docstring(self):
+    lineno = 5  # while x < 2:
+    target = '1'
+    source = '''"""Example
+docstring
+"""
+x = 1
+while x < 2:
+  y = 3
+  while y < 4:
+    y += 5
+  x += 6
+'''
+    raw = process.make_rawruntimeerrorproblem(
+        source, target, lineno)
+    nodes = process.get_nodes_at_lineno(raw, lineno)
+    self.assertEqual(nodes, [2])
+
+  def test_get_nodes_at_lineno_for(self):
+    lineno = 5  # for y in range(100):
+    target = '1'
+    source = '''"""Example
+docstring
+"""
+x = 1
+for y in range(100):
+  while y < 4:
+    y += 5
+  x += 6
+'''
+    raw = process.make_rawruntimeerrorproblem(
+        source, target, lineno)
+    nodes = process.get_nodes_at_lineno(raw, lineno)
+    self.assertEqual(nodes, [2, 3])
+
+  def test_get_nodes_at_lineno_multiline(self):
+    lineno = 6  # 100/0
+    target = '1'
+    source = '''"""Example
+docstring
+"""
+x = 1
+for y in range(
+  100/0
+):
+  while y < 4:
+    y += 5
+  x += 6
+'''
+    raw = process.make_rawruntimeerrorproblem(
+        source, target, lineno)
+    nodes = process.get_nodes_at_lineno(raw, lineno)
+    self.assertEqual(nodes, [2])  # range(100/0)
+
+  def test_get_nodes_at_lineno_multiline_unpack(self):
+    lineno = 6  # x,y (second physical line of the backslash-continued for header)
+    target = '1'
+    source = r'''"""Example
+docstring
+"""
+x = 1
+for \
+x,y\
+ in range(100):
+  while y < 4:
+    y += 5
+  x += 6
+'''
+    raw = process.make_rawruntimeerrorproblem(
+        source, target, lineno)
+    nodes = process.get_nodes_at_lineno(raw, lineno)
+    self.assertEqual(nodes, [3])
+
+  def test_get_nodes_at_lineno_multiline_ambiguous(self):
+    lineno = 5  # for x,y in range(
+    target = '1'
+    source = '''"""Example
+docstring
+"""
+x = 1
+for x,y in range(
+  100
+):
+  while y < 4:
+    y += 5
+  x += 6
+'''
+    raw = process.make_rawruntimeerrorproblem(
+        source, target, lineno)
+    nodes = process.get_nodes_at_lineno(raw, lineno)
+    self.assertEqual(nodes, [2, 3])
 
 if __name__ == '__main__':
   unittest.main()
@@ -201,6 +201,8 @@ def compute_localization_accuracy(
     return None
 
   def is_correct(targets, num_targets, prediction):
+    # targets.shape: max_num_targets
+    # num_targets.shape: scalar.
     is_example = num_targets > 0
     mask = jnp.arange(targets.shape[0]) < num_targets
     # mask.shape: max_num_targets
 
@@ -151,5 +151,26 @@ def test_compute_weighted_f1_score_error_only_omits_correct_examples(self):
     # weighted average is 1/3.
     self.assertEqual(f1_score, 1/3)
 
+  def test_compute_localization_accuracy(self):
+    localization_targets = jnp.array([
+        [0, 1, 2, 0, 0, 0, 0],  # correct
+        [1, 2, 0, 0, 0, 0, 0],  # correct
+        [0, 0, 0, 0, 0, 0, 0],  # is_example == False
+        [0, 0, 0, 0, 0, 0, 0],  # correct
+        [0, 1, 2, 0, 0, 0, 0],  # incorrect
+        [1, 2, 0, 0, 0, 0, 0],  # incorrect
+        [0, 0, 0, 0, 0, 0, 0],  # is_example == False
+        [0, 0, 0, 0, 0, 0, 0],  # incorrect
+        [4, 5, 6, 0, 0, 0, 0],  # correct
+    ])
+    localization_num_targets = jnp.array([3, 2, 0, 1, 3, 2, 0, 1, 3])
+    localization_predictions = jnp.array([0, 2, 0, 0, 3, 0, 1, 1, 4])
+    acc = metrics.compute_localization_accuracy(
+        localization_targets,
+        localization_num_targets,
+        localization_predictions)
+    self.assertEqual(acc, 4/7)
+
+
 if __name__ == '__main__':
   unittest.main()
@@ -122,10 +122,4 @@ def __call__(self, x):
       )(exit_node_embeddings)
     # logits.shape: batch_size, num_classes
 
-    if config.raise_in_ipagnn:
-      per_node_raise_contributions = raise_contributions_lib.get_raise_contribution_from_batch_and_aux(
-          x, ipagnn_output)
-      localization_logits = per_node_raise_contributions
-      ipagnn_output['localization_logits'] = localization_logits
-
     return logits, ipagnn_output
@@ -6,6 +6,7 @@
 
 from core.lib.metrics import EvaluationMetric
 from core.modules.ipagnn import rnn
+from core.modules.ipagnn import raise_contributions as raise_contributions_lib
 
 
 def _rnn_state_to_embedding(hidden_state):
@@ -61,7 +62,7 @@ def __call__(
     config = self.config
 
     # State. Varies from step to step.
-    hidden_states, instruction_pointer, current_step = carry
+    hidden_states, instruction_pointer, attribution, current_step = carry
 
     # Inputs.
     vocab_size = info.vocab_size
@@ -229,6 +230,8 @@ def set_values(a, value, index):
       # raise_decision.shape: batch_size, num_nodes, 2
       # Make sure you cannot raise from the exit node.
       raise_decisions = batch_set(raise_decisions, jnp.array([0, 1]), exit_node_indexes)
+      # Make sure you cannot raise from the raise node.
+      raise_decisions = batch_set(raise_decisions, jnp.array([0, 1]), raise_node_indexes)
       # raise_decision.shape: batch_size, num_nodes, 2
     else:
       raise_decisions = jnp.concatenate([
@@ -257,6 +260,18 @@ def set_values(a, value, index):
         raise_node_indexes, true_indexes, false_indexes, raise_indexes)
     # leaves(hidden_states_new).shape: batch_size, num_nodes, hidden_size
 
+    attribution = raise_contributions_lib.get_raise_contribution_step_batch(
+        attribution,
+        instruction_pointer,
+        branch_decisions,
+        raise_decisions,
+        true_indexes,
+        false_indexes,
+        raise_indexes,
+        num_nodes,
+    )
+    # attribution.shape: batch_size, num_nodes, num_nodes
+
     # current_step.shape: batch_size
     # step_limits.shape: batch_size
     instruction_pointer_orig = instruction_pointer
@@ -281,7 +296,7 @@ def set_values(a, value, index):
         'hidden_state_contributions': hidden_state_contributions,
     }
     aux.update(aux_ip)
-    return (hidden_states, instruction_pointer, current_step), aux
+    return (hidden_states, instruction_pointer, attribution, current_step), aux
 
 
 class IPAGNNModule(nn.Module):
@@ -400,10 +415,12 @@ def make_instruction_pointer(start_node_index):
     instruction_pointer = jax.vmap(make_instruction_pointer)(start_node_indexes)
     # instruction_pointer.shape: batch_size, num_nodes
 
+    attribution = jnp.zeros((batch_size, num_nodes, num_nodes))
+
     # Run self.max_steps steps of IPAGNNLayer.
-    (hidden_states, instruction_pointer, current_step), aux = self.ipagnn_layer_scan(
+    (hidden_states, instruction_pointer, attribution, current_step), aux = self.ipagnn_layer_scan(
         # State:
-        (hidden_states, instruction_pointer, current_step),
+        (hidden_states, instruction_pointer, attribution, current_step),
         # Inputs:
         node_embeddings,
         edge_sources,
@@ -438,6 +455,10 @@ def get_hidden_state_single_example(hidden_states, node_index):
     raise_node_instruction_pointer = get_instruction_pointer_value(instruction_pointer, raise_node_indexes)
     # raise_node_instruction_pointer.shape: batch_size
 
+    if config.raise_in_ipagnn and config.unsupervised_localization:
+      localization_logits = attribution[jnp.arange(batch_size), raise_node_indexes]
+      aux['localization_logits'] = localization_logits
+
     aux.update({
         'exit_node_instruction_pointer': exit_node_instruction_pointer,
         'exit_node_embeddings': exit_node_embeddings,