This repository was archived by the owner on Jan 22, 2024. It is now read-only.

Commit b8d7f31

Larger and more consistent datasets (#79)
1. Fix get_python_major_version (it was missing a comma) so we aren't dropping nearly 1M good examples any more. Oops. (See the sketch below.)
2. Add an in_dataset field and config.use_in_dataset_field to ensure the docstring and non-docstring datasets contain the same examples. When use_in_dataset_field is set, only examples with in_dataset=1 pass through filtering.
3. Add docstring_tokens to support FiLM-style approaches.
4. Regenerate the dataset, adding a new small 1% dataset to the git repo.
5. Add lots of counting and printing to process_codenet to track down the pesky missing comma.
1 parent a0c3384 commit b8d7f31
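On item 1, the sketch promised above: in a list literal, Python implicitly concatenates adjacent string literals, so a missing comma silently merges two entries instead of raising a syntax error. A minimal illustration (not the repo's actual code):

```python
# Adjacent string literals are concatenated at compile time, so the
# missing comma below merges two versions into one bogus entry.
versions = [
    'Python (3.4.3)',
    'Python (3.8.2)'   # <-- missing comma
    'PyPy3 (2.4.0)',
]
print(versions)
# ['Python (3.4.3)', 'Python (3.8.2)PyPy3 (2.4.0)']

# Membership checks against this list then miss both versions, which
# is how the affected submissions fell out of the dataset:
print('Python (3.8.2)' in versions)  # False
print('PyPy3 (2.4.0)' in versions)   # False
```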

20 files changed (+195751, −21 lines)


config/default.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -20,6 +20,7 @@ def default_config():
   config.experiment_id: Optional[Text] = ''  # An experiment is launched by a single command, may have multiple runs.
   config.run_id: Optional[Text] = ''  # A run is a single trainer run with a single set of hparams. run_id should identify hparams.
   config.notes: Optional[Text] = ''  # Any notes to record about the run.
+  config.use_in_dataset_field = True

   # Training configs
   config.optimizer = 'adam'  # sgd, adam
```

core/data/codenet.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -151,7 +151,7 @@ def get_python_major_version(problem_id, submission_id):
       'Python3',
       'Python (3.4.2)',
       'Python (3.4.3)',
-      'Python (3.8.2)'
+      'Python (3.8.2)',
       'PyPy3 (2.4.0)',
       'PyPy3 (7.3.0)',
   ]:
```

core/data/codenet_paths.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -5,16 +5,16 @@
 import time

 DEFAULT_CONFIG_PATH = 'config/default.py'
-DEFAULT_DATASET_PATH = 'datasets/codenet/2021-11-01-f=0.01'
-TEST_DATASET_PATH = 'datasets/codenet/2021-11-01-f=0.01'
+DEFAULT_DATASET_PATH = 'datasets/codenet/2021-12-06-f=0.01'
+TEST_DATASET_PATH = 'datasets/codenet/2021-12-06-f=0.01'
 DEFAULT_TOKENIZER_PATH = 'out/tokenizers/train-1000000.json'
 DOCSTRING_TOKENIZER_PATH = 'out/tokenizers/train-docstrings-1000000.json'
 DEFAULT_SPLITS_PATH = 'out/splits/default.json'
 DEFAULT_EXPERIMENTS_DIR = 'out/experiments'
 EXPERIMENT_ID_PATH = 'out/experiment_id.txt'

-FULL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-10-07-full'
-FULL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-11-17'
+FULL_DATASET_PATH = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-06-nodoc'
+FULL_DATASET_PATH_WITH_DOCSTRINGS = '/mnt/runtime-error-problems-experiments/datasets/project-codenet/2021-12-06'
 # Raw control_flow_programs data pattern:
 DEFAULT_CFP_DATA_PATTERN = '/mnt/runtime-error-problems-experiments/datasets/control_flow_programs/decimal-large-state-L10/0.0.48/control_flow_programs-train.tfrecord-*'
 # Processed control_flow_programs dataset path:
```

core/data/data_io.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -19,6 +19,7 @@ def to_tf_example(problem):
   """Constructs a tf.train.Example for the process.RuntimeErrorProblem."""
   return tf.train.Example(features=tf.train.Features(feature={
       'tokens': _int64_feature(problem.tokens),
+      'docstring_tokens': _int64_feature(problem.docstring_tokens),
       'edge_sources': _int64_feature(problem.edge_sources),
       'edge_dests': _int64_feature(problem.edge_dests),
       'edge_types': _int64_feature(problem.edge_types),
@@ -39,6 +40,7 @@ def to_tf_example(problem):
       'problem_id': _bytes_feature([problem.problem_id]),
       'submission_id': _bytes_feature([problem.submission_id]),

+      'in_dataset': _int64_feature([problem.in_dataset]),
       'num_tokens': _int64_feature([len(problem.tokens)]),
       'num_nodes': _int64_feature([len(problem.true_branch_nodes)]),
       'num_edges': _int64_feature([len(problem.edge_sources)]),
@@ -48,6 +50,7 @@ def to_tf_example(problem):
 def decode_fn(record_bytes, include_strings=False):
   features = {
       'tokens': _int64_sequence_feature(),
+      'docstring_tokens': _int64_sequence_feature(),
       'edge_sources': _int64_sequence_feature(),
       'edge_dests': _int64_sequence_feature(),
       'edge_types': _int64_sequence_feature(),
@@ -65,6 +68,7 @@ def decode_fn(record_bytes, include_strings=False):
       'target_node_indexes': _int64_sequence_feature(),
       'num_target_nodes': _int64_scalar_feature(),

+      'in_dataset': _int64_scalar_feature(),
       'num_tokens': _int64_scalar_feature(),
       'num_nodes': _int64_scalar_feature(),
       'num_edges': _int64_scalar_feature(),
@@ -80,6 +84,7 @@ def decode_fn(record_bytes, include_strings=False):
 def get_fake_input(batch_size, max_tokens, max_num_nodes, max_num_edges):
   return {
       'tokens': jnp.ones((batch_size, max_tokens), dtype=jnp.int32),
+      'docstring_tokens': jnp.ones((batch_size, max_tokens), dtype=jnp.int32),
       'edge_sources': jnp.zeros((batch_size, max_num_edges), dtype=jnp.int32),
       'edge_dests': jnp.ones((batch_size, max_num_edges), dtype=jnp.int32),
       'edge_types': jnp.zeros((batch_size, max_num_edges), dtype=jnp.int32),
@@ -101,6 +106,7 @@ def get_fake_input(batch_size, max_tokens, max_num_nodes, max_num_edges):
       # 'problem_id': jnp.full((batch_size,), 'p12345', dtype=jnp.string),
       # 'submission_id': jnp.full((batch_size,), 's123456789', dtype=jnp.string),

+      'in_dataset': jnp.ones((batch_size, 1), dtype=jnp.int32),
       'num_tokens': jnp.full((batch_size, 1), max_tokens, dtype=jnp.int32),
       'num_nodes': jnp.full((batch_size, 1), max_num_nodes, dtype=jnp.int32),
       'num_edges': jnp.full((batch_size, 1), max_num_edges, dtype=jnp.int32),
@@ -113,6 +119,7 @@ def get_padded_shapes(max_tokens, max_num_nodes, max_num_edges, include_strings=
   max_target_nodes = 20
   shapes = {
       'tokens': [max_tokens],
+      'docstring_tokens': [max_tokens],
       'edge_sources': [max_num_edges],
       'edge_dests': [max_num_edges],
       'edge_types': [max_num_edges],
@@ -130,6 +137,7 @@ def get_padded_shapes(max_tokens, max_num_nodes, max_num_edges, include_strings=
       'target_node_indexes': [max_target_nodes],
       'num_target_nodes': [1],

+      'in_dataset': [1],
       'num_tokens': [1],
       'num_nodes': [1],
       'num_edges': [1],
@@ -146,6 +154,7 @@ def get_padded_shapes(max_tokens, max_num_nodes, max_num_edges, include_strings=
 def make_filter(
     max_tokens, max_num_nodes, max_num_edges, max_steps, allowlist=None,
     class_subsample_values=None,
+    use_in_dataset_field=True,
 ):
   """Makes a tf.Dataset filter function.

@@ -179,6 +188,9 @@ def fn(example):
       class_ok |= (target == index)
     allowed = allowed & class_ok

+    if use_in_dataset_field:
+      allowed &= tf.squeeze(example['in_dataset'] == 1, axis=-1)
+
     # Filter x% of examples with target == 1 (the most common class).
     if class_subsample_values is not None:
       for key, value in class_subsample_values.items():
```

core/data/process.py

Lines changed: 26 additions & 2 deletions

```diff
@@ -41,6 +41,7 @@ class RawRuntimeErrorProblem:
 class RuntimeErrorProblem:
   """RuntimeErrorProblem for use on an accelerator."""
   tokens: List[int]
+  docstring_tokens: List[int]
   problem_id: Text
   submission_id: Text
   edge_sources: List[int]
@@ -58,6 +59,7 @@ class RuntimeErrorProblem:
   target: int
   target_lineno: Optional[int]
   target_node_indexes: List[int]
+  in_dataset: bool


 def get_character_index(source, lineno, col_offset):
@@ -380,17 +382,38 @@ def get_nodes_at_lineno(raw, lineno):
   return overlapping_nodes


-def make_runtimeerrorproblem(source, target, target_lineno=0, tokenizer=None,
-                             problem_id=None, submission_id=None):
+def hardcoded_filter(tokens_extended):
+  return len(tokens_extended) <= 512
+
+
+def make_runtimeerrorproblem(
+    source, target, docstring=None, extended_source=None,
+    target_lineno=0, tokenizer=None,
+    problem_id=None, submission_id=None):
   raw = make_rawruntimeerrorproblem(
       source, target, target_lineno=target_lineno,
       problem_id=problem_id, submission_id=submission_id)
   tokenizer = tokenizer or tokenization.load_tokenizer()
   token_data = tokenize_raw_with_spans(tokenizer, raw)
+
+  if extended_source is not None and extended_source != source:
+    extended_tokenized = tokenizer(extended_source)
+    tokens_extended = extended_tokenized['input_ids']
+  else:
+    tokens_extended = token_data['tokens']
+  if docstring is not None:
+    docstring_tokenized = tokenizer(docstring)
+    docstring_tokens = docstring_tokenized['input_ids']
+  else:
+    docstring_tokens = []
+
+  in_dataset = hardcoded_filter(tokens_extended)
+
   branch_list = np.array(raw.branch_list)
   target_node_indexes = get_nodes_at_lineno(raw, target_lineno)
   return RuntimeErrorProblem(
       tokens=token_data['tokens'],
+      docstring_tokens=docstring_tokens,
       problem_id=raw.problem_id,
       submission_id=raw.submission_id,
       edge_sources=raw.edge_sources,
@@ -408,6 +431,7 @@ def make_runtimeerrorproblem(source, target, target_lineno=0, tokenizer=None,
       target=raw.target,
       target_lineno=raw.target_lineno,
       target_node_indexes=target_node_indexes,
+      in_dataset=in_dataset,
   )
```
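Per item 3 of the commit message, `docstring_tokens` exists "to support FiLM-style approaches". As a hedged illustration only (the repo's model code is not part of this diff, so all names and shapes here are assumptions), FiLM conditioning pools the docstring tokens into an embedding that scales and shifts the model's hidden features:

```python
import jax.numpy as jnp

def film_condition(hidden, docstring_embedding, w_gamma, w_beta):
  """Feature-wise linear modulation sketch: hidden * gamma + beta.

  hidden: (batch, length, features) model activations.
  docstring_embedding: (batch, d) pooled docstring representation.
  w_gamma, w_beta: (d, features) projection matrices.
  """
  gamma = docstring_embedding @ w_gamma  # (batch, features)
  beta = docstring_embedding @ w_beta    # (batch, features)
  # Broadcast the per-example gamma/beta over the length dimension.
  return hidden * gamma[:, None, :] + beta[:, None, :]
```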

core/lib/trainer.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -65,7 +65,8 @@ def load_dataset(
     allowlist = error_kinds.TIER1_ERROR_IDS
   filter_fn = data_io.make_filter(
       config.max_tokens, config.max_num_nodes, config.max_num_edges,
-      config.max_steps, allowlist=allowlist, class_subsample_values={1: 0.0660801055})
+      config.max_steps, allowlist=allowlist, class_subsample_values={1: 0.0660801055},
+      use_in_dataset_field=config.use_in_dataset_field)

   if config.binary_targets:
     map_fn = functools.partial(data_io.binarize_targets, dataset_path=dataset_path)
```
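Since the flag defaults to True both in `make_filter` and in `config/default.py`, the new filtering is on by default. A hedged sketch of turning it off for an ablation run, assuming `config/default.py` is importable under this path:

```python
from config.default import default_config  # assumed import path

# With the flag off, make_filter ignores the in_dataset mark and
# reproduces the pre-#79 filtering behavior.
config = default_config()
config.use_in_dataset_field = False
```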
