# coding=utf-8

from __future__ import print_function

import re
import cPickle as pickle
import ast
import astor
import nltk
import sys

import numpy as np

from asdl.asdl import ASDLGrammar
from asdl.asdl_ast import RealizedField
from asdl.lang.py.py_asdl_helper import python_ast_to_asdl_ast, asdl_ast_to_python_ast
from asdl.lang.py.py_transition_system import PythonTransitionSystem
from asdl.hypothesis import *

from components.action_info import ActionInfo

# patterns for code fragments that cannot be parsed on their own:
# dangling elif/else/except/finally branches and bare decorators
p_elif = re.compile(r'^elif\s?')
p_else = re.compile(r'^else\s?')
p_try = re.compile(r'^try\s?')
p_except = re.compile(r'^except\s?')
p_finally = re.compile(r'^finally\s?')
p_decorator = re.compile(r'^@.*')

QUOTED_STRING_RE = re.compile(r"(?P<quote>['\"])(?P<string>.*?)(?<!\\)(?P=quote)")
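# QUOTED_STRING_RE captures (quote, contents) pairs and skips escaped closing quotes,
# e.g. findall() on: replace "foo" with 'bar'  ->  [('"', 'foo'), ("'", 'bar')]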


class Django(object):
    @staticmethod
    def canonicalize_code(code):
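        """
        Wrap a standalone code fragment (a dangling elif/else/except/finally
        branch, a bare decorator, or a lone block header) into a snippet that
        ast.parse accepts, e.g. 'elif x:' is prefixed with 'if True: pass' and
        suffixed with 'pass' so that it parses.
        """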
        if p_elif.match(code):
            code = 'if True: pass\n' + code

        if p_else.match(code):
            code = 'if True: pass\n' + code

        if p_try.match(code):
            code = code + 'pass\nexcept: pass'
        elif p_except.match(code):
            code = 'try: pass\n' + code
        elif p_finally.match(code):
            code = 'try: pass\n' + code

        if p_decorator.match(code):
            code = code + '\ndef dummy(): pass'

        if code[-1] == ':':
            code = code + 'pass'

        return code

    @staticmethod
    def canonicalize_query(query):
        """
        Canonicalize the query: replace quoted string literals with placeholder
        tokens (_STR:0_, _STR:1_, ...) and return the tokenized query together
        with a map from each literal to its placeholder.
        """
        str_count = 0
        str_map = dict()

        matches = QUOTED_STRING_RE.findall(query)
        # de-duplicate
        cur_replaced_strs = set()
        for match in matches:
            # when the pattern contains groups, findall() returns tuples of groups
            quote = match[0]
            str_literal = quote + match[1] + quote

            if str_literal in cur_replaced_strs:
                continue

            # FIXME: substitute the ' % s ' with
            if str_literal in ['\'%s\'', '\"%s\"']:
                continue

            str_repr = '_STR:%d_' % str_count
            str_map[str_literal] = str_repr

            query = query.replace(str_literal, str_repr)

            str_count += 1
            cur_replaced_strs.add(str_literal)

        # tokenize
        query_tokens = nltk.word_tokenize(query)

        new_query_tokens = []
        # break up dotted names like foo.bar.func: after the original token,
        # append '[', the individual components separated by '.', and ']'
        for token in query_tokens:
            new_query_tokens.append(token)
            i = token.find('.')
            if 0 < i < len(token) - 1:
                new_tokens = ['['] + token.replace('.', ' . ').split(' ') + [']']
                new_query_tokens.extend(new_tokens)

        query = ' '.join(new_query_tokens)

        return query, str_map

    @staticmethod
    def canonicalize_example(query, code):
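        """
        Canonicalize a (query, code) pair: replace string literals in the query
        with placeholders, substitute the same placeholders into the code, and
        verify that the resulting code still parses.
        """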
        canonical_query, str_map = Django.canonicalize_query(query)
        query_tokens = canonical_query.split(' ')
        canonical_code = code

        for str_literal, str_repr in str_map.iteritems():
            canonical_code = canonical_code.replace(str_literal, '\'' + str_repr + '\'')

        canonical_code = Django.canonicalize_code(canonical_code)

        # sanity check
        try:
            gold_ast_tree = ast.parse(canonical_code).body[0]
        except:
            print('warning: failed to parse the canonicalized code, falling back to the original code', file=sys.stderr)
            canonical_code = Django.canonicalize_code(code)
            gold_ast_tree = ast.parse(canonical_code).body[0]
            str_map = {}

        # parse_tree = python_ast_to_asdl_ast(gold_ast_tree, grammar)
        # gold_source = astor.to_source(gold_ast_tree)
        # ast_tree = asdl_ast_to_python_ast(parse_tree, grammar)
        # source = astor.to_source(ast_tree)

        # assert gold_source == source, 'sanity check fails: gold=[%s], actual=[%s]' % (gold_source, source)
        #
        # # action check
        # parser = PythonTransitionSystem(grammar)
        # actions = parser.get_actions(parse_tree)
        #
        # hyp = Hypothesis()
        # for action in actions:
        #     assert action.__class__ in parser.get_valid_continuation_types(hyp)
        #     if isinstance(action, ApplyRuleAction):
        #         assert action in parser.get_valid_continuations(hyp)
        #     hyp.apply_action(action)
        #
        # src_from_hyp = astor.to_source(asdl_ast_to_python_ast(hyp.tree, grammar))
        # assert src_from_hyp == gold_source

        return query_tokens, canonical_code, str_map

    @staticmethod
    def parse_django_dataset(annot_file, code_file, asdl_file_path, MAX_QUERY_LENGTH=70):
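        """
        Load parallel annotation/code files, convert every target program into
        an ASDL AST and its action sequence, build the source and primitive
        vocabularies, and split the examples into train (first 16000),
        dev (next 1000), and test (the rest).
        """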
        asdl_text = open(asdl_file_path).read()
        grammar = ASDLGrammar.from_text(asdl_text)
        transition_system = PythonTransitionSystem(grammar)

        loaded_examples = []

        from components.vocab import Vocab, VocabEntry
        from components.dataset import Example

        for idx, (src_query, tgt_code) in enumerate(zip(open(annot_file), open(code_file))):
            src_query = src_query.strip()
            tgt_code = tgt_code.strip()

            src_query_tokens, tgt_canonical_code, str_map = Django.canonicalize_example(src_query, tgt_code)
            python_ast = ast.parse(tgt_canonical_code).body[0]
            gold_source = astor.to_source(python_ast)
            tgt_ast = python_ast_to_asdl_ast(python_ast, grammar)
            tgt_actions = transition_system.get_actions(tgt_ast)

            # sanity check: the action sequence must reconstruct the gold source
            hyp = Hypothesis()
            for action in tgt_actions:
                assert action.__class__ in transition_system.get_valid_continuation_types(hyp)
                if isinstance(action, ApplyRuleAction):
                    assert action.production in transition_system.get_valid_continuating_productions(hyp)
                hyp.apply_action(action)

            src_from_hyp = astor.to_source(asdl_ast_to_python_ast(hyp.tree, grammar))
            assert src_from_hyp == gold_source

            loaded_examples.append({'src_query_tokens': src_query_tokens,
                                    'tgt_canonical_code': tgt_canonical_code,
                                    'tgt_ast': tgt_ast,
                                    'tgt_actions': tgt_actions,
                                    'raw_code': tgt_code, 'str_map': str_map})

            print('first pass, processed %d' % idx, file=sys.stderr)

        src_vocab = VocabEntry.from_corpus([e['src_query_tokens'] for e in loaded_examples], size=5000, freq_cutoff=3)

        primitive_tokens = [map(lambda a: a.token,
                                filter(lambda a: isinstance(a, GenTokenAction), e['tgt_actions']))
                            for e in loaded_examples]

        primitive_vocab = VocabEntry.from_corpus(primitive_tokens, size=5000, freq_cutoff=3)
        assert '_STR:0_' in primitive_vocab

        vocab = Vocab(source=src_vocab, primitive=primitive_vocab)
        print('generated vocabulary %s' % repr(vocab), file=sys.stderr)

        train_examples = []
        dev_examples = []
        test_examples = []

        action_len = []

        for idx, e in enumerate(loaded_examples):
            src_query_tokens = e['src_query_tokens'][:MAX_QUERY_LENGTH]
            tgt_actions = e['tgt_actions']
            tgt_action_infos = Django.get_action_infos(src_query_tokens, tgt_actions)

            example = Example(idx=idx,
                              src_sent=src_query_tokens,
                              tgt_actions=tgt_action_infos,
                              tgt_code=e['tgt_canonical_code'],
                              tgt_ast=e['tgt_ast'],
                              meta={'raw_code': e['raw_code'], 'str_map': e['str_map']})

            print('second pass, processed %d' % idx, file=sys.stderr)

            action_len.append(len(tgt_action_infos))

            # train, dev, test split
            if 0 <= idx < 16000:
                train_examples.append(example)
            elif 16000 <= idx < 17000:
                dev_examples.append(example)
            else:
                test_examples.append(example)

        print('Max action len: %d' % max(action_len), file=sys.stderr)
        print('Avg action len: %d' % np.average(action_len), file=sys.stderr)
        print('Examples with more than 100 actions: %d' % len(filter(lambda x: x > 100, action_len)), file=sys.stderr)

        return (train_examples, dev_examples, test_examples), vocab

    @staticmethod
    def get_action_infos(src_query, tgt_actions):
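        """
        Wrap each target action in an ActionInfo that records its timestep,
        the creation time, production, and field of the current frontier node,
        and, for GenTokenAction, whether the token can be copied from the
        source query and from which position.
        """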
        action_infos = []
        hyp = Hypothesis()
        for t, action in enumerate(tgt_actions):
            action_info = ActionInfo(action)
            action_info.t = t
            if hyp.frontier_node:
                action_info.parent_t = hyp.frontier_node.created_time
                action_info.frontier_prod = hyp.frontier_node.production
                action_info.frontier_field = hyp.frontier_field.field

            if isinstance(action, GenTokenAction):
                try:
                    # record the first position where the token can be copied from the source query
                    tok_src_idx = src_query.index(str(action.token))
                    action_info.copy_from_src = True
                    action_info.src_token_position = tok_src_idx
                except ValueError:
                    pass

            hyp.apply_action(action)
            action_infos.append(action_info)

        return action_infos

    @staticmethod
    def generate_django_dataset():
        annot_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.anno'
        code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'

        (train, dev, test), vocab = Django.parse_django_dataset(annot_file, code_file, 'asdl/lang/py/py_asdl.txt')

        pickle.dump(train, open('data/django/train.bin', 'w'))
        pickle.dump(dev, open('data/django/dev.bin', 'w'))
        pickle.dump(test, open('data/django/test.bin', 'w'))
        pickle.dump(vocab, open('data/django/vocab.bin', 'w'))

    @staticmethod
    def run():
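        """
        Sanity-check the full dataset: every example must round-trip from
        canonical code to an ASDL AST and action sequence and back to the gold
        source produced by astor.
        """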
        asdl_text = open('asdl/lang/py/py_asdl.txt').read()
        grammar = ASDLGrammar.from_text(asdl_text)

        annot_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.anno'
        code_file = '/Users/yinpengcheng/Research/SemanticParsing/CodeGeneration/en-django/all.code'

        transition_system = PythonTransitionSystem(grammar)

        for idx, (src_query, tgt_code) in enumerate(zip(open(annot_file), open(code_file))):
            src_query = src_query.strip()
            tgt_code = tgt_code.strip()

            query_tokens, tgt_canonical_code, str_map = Django.canonicalize_example(src_query, tgt_code)
            python_ast = ast.parse(tgt_canonical_code).body[0]
            gold_source = astor.to_source(python_ast)
            tgt_ast = python_ast_to_asdl_ast(python_ast, grammar)
            tgt_actions = transition_system.get_actions(tgt_ast)

            # sanity check
            hyp = Hypothesis()
            for action in tgt_actions:
                assert action.__class__ in transition_system.get_valid_continuation_types(hyp)
                if isinstance(action, ApplyRuleAction):
                    assert action.production in transition_system.get_valid_continuating_productions(hyp)
                hyp.apply_action(action)

            src_from_hyp = astor.to_source(asdl_ast_to_python_ast(hyp.tree, grammar))
            assert src_from_hyp == gold_source


if __name__ == '__main__':
    Django.run()
    # f1 = Field('hahah', ASDLPrimitiveType('123'), 'single')
    # rf1 = RealizedField(f1, value=123)
    #
    # # print(f1 == rf1)
    # a = {f1: 1}
    # print(a[rf1])
    # Django.generate_django_dataset()