diff --git a/regparser/grammar/unified.py b/regparser/grammar/unified.py index 12aa8da..8e860da 100644 --- a/regparser/grammar/unified.py +++ b/regparser/grammar/unified.py @@ -112,6 +112,7 @@ def appendix_section(match): atomic.subpart_marker.copy().setParseAction(keep_pos).setResultsName( "marker") + atomic.subpart) + marker_subpart_title = ( atomic.subpart_marker.copy().setParseAction(keep_pos).setResultsName( "marker") diff --git a/regparser/tree/xml_parser/reg_text.py b/regparser/tree/xml_parser/reg_text.py index fbf4288..ec64a30 100644 --- a/regparser/tree/xml_parser/reg_text.py +++ b/regparser/tree/xml_parser/reg_text.py @@ -332,8 +332,11 @@ def build_from_section(reg_part, section_xml): nodes = [] section_nums = [] - for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no): - section_nums.append(int(match.group(1))) + for match in re.finditer(r'%s\.(\d+[a-z]*)' % reg_part, section_no): + secnum_candidate = match.group(1) + if secnum_candidate.isdigit(): + secnum_candidate = int(secnum_candidate) + section_nums.append(secnum_candidate) # Span of section numbers if u'§§' == section_no[:2] and '-' in section_no: diff --git a/requirements_test.txt b/requirements_test.txt index 605e75b..755f0af 100644 --- a/requirements_test.txt +++ b/requirements_test.txt @@ -1,5 +1,6 @@ cov-core==1.7 coverage==3.6 +datatree==0.1.8.1 flake8==2.4.0 mock==1.0.1 nose==1.2.1 diff --git a/tests/grammar_unified_tests.py b/tests/grammar_unified_tests.py index c7cd4cc..68b3da7 100644 --- a/tests/grammar_unified_tests.py +++ b/tests/grammar_unified_tests.py @@ -39,3 +39,16 @@ def test_marker_comment(self): result = unified.marker_comment.parseString(t) self.assertEqual("3", result.section) self.assertEqual("4", result.c1) + + def test_marker_subpart_title(self): + # Typical case: + text = u'Subpart K\u2014Exportation' + result = unified.marker_subpart_title.parseString(text) + self.assertEqual(u'Exportation', result.subpart_title) + self.assertEqual(u'K', result.subpart) + + # Reserved subpart: + text = u'Subpart J [Reserved]' + result = unified.marker_subpart_title.parseString(text) + self.assertEqual(u'[Reserved]', result.subpart_title) + self.assertEqual(u'J', result.subpart) diff --git a/tests/tree_xml_parser_reg_text_tests.py b/tests/tree_xml_parser_reg_text_tests.py index d50f20d..31ce3e5 100644 --- a/tests/tree_xml_parser_reg_text_tests.py +++ b/tests/tree_xml_parser_reg_text_tests.py @@ -5,19 +5,21 @@ from mock import patch from regparser.tree.xml_parser import reg_text +from tests.xml_builder import LXMLBuilder class RegTextTest(TestCase): + def setUp(self): + self.tree = LXMLBuilder() + def test_build_from_section_intro_text(self): - xml = u""" -
- § 8675.309 - Definitions. -

Some content about this section.

-

(a) something something

-
- """ - node = reg_text.build_from_section('8675', etree.fromstring(xml))[0] + with self.tree.builder("SECTION") as root: + root.SECTNO(u"§ 8675.309") + root.SUBJECT("Definitions.") + root.P("Some content about this section.") + root.P("(a) something something") + node = reg_text.build_from_section('8675', self.tree.render_xml())[0] + self.assertEqual('Some content about this section.', node.text.strip()) self.assertEqual(1, len(node.children)) self.assertEqual(['8675', '309'], node.label) @@ -59,16 +61,13 @@ def test_build_from_section_unnumbered_defs(self): self.assertEqual(['8675', '309', 'Bop'], child.label) def test_build_from_section_collapsed_level(self): - xml = u""" -
- § 8675.309 - Definitions. -

(a) Transfers —(1) Notice. follow -

-

(b) Contents (1) Here

-
- """ - node = reg_text.build_from_section('8675', etree.fromstring(xml))[0] + with self.tree.builder("SECTION") as root: + root.SECTNO(u"§ 8675.309") + root.SUBJECT("Definitions.") + root.P(_xml=u"""(a) Transfers —(1) + Notice. follow""") + root.P(_xml="""(b) Contents (1) Here""") + node = reg_text.build_from_section('8675', self.tree.render_xml())[0] self.assertEqual(node.label, ['8675', '309']) self.assertEqual(2, len(node.children)) self.assertEqual(node.children[0].label, ['8675', '309', 'a']) @@ -80,32 +79,26 @@ def test_build_from_section_collapsed_level(self): self.assertEqual(1, len(node.children[1].children)) def test_build_from_section_collapsed_level_emph(self): - xml = u""" -
- § 8675.309 - Definitions. -

(a) aaaa

-

(1) 1111

-

(i) iiii

-

(A) AAA—(1) eeee

-
- """ - node = reg_text.build_from_section('8675', etree.fromstring(xml))[0] + with self.tree.builder('SECTION') as root: + root.SECTNO(u"§ 8675.309") + root.SUBJECT("Definitions.") + root.P("(a) aaaa") + root.P("(1) 1111") + root.P("(i) iiii") + root.P(_xml=u"""(A) AAA—(1) eeee""") + node = reg_text.build_from_section('8675', self.tree.render_xml())[0] a1iA = node.children[0].children[0].children[0].children[0] self.assertEqual(u"(A) AAA—", a1iA.text) self.assertEqual(1, len(a1iA.children)) self.assertEqual("(1) eeee", a1iA.children[0].text.strip()) def test_build_from_section_double_collapsed(self): - xml = u""" -
- § 8675.309 - Definitions. -

(a) Keyterm—(1)(i) Content

-

(ii) Content2

-
- """ - node = reg_text.build_from_section('8675', etree.fromstring(xml))[0] + with self.tree.builder('SECTION') as root: + root.SECTNO(u'§ 8675.309') + root.SUBJECT('Definitions.') + root.P(_xml=u"""(a) Keyterm—(1)(i) Content""") + root.P("(ii) Content2") + node = reg_text.build_from_section('8675', self.tree.render_xml())[0] self.assertEqual(['8675', '309'], node.label) self.assertEqual(1, len(node.children)) @@ -122,24 +115,20 @@ def test_build_from_section_double_collapsed(self): self.assertEqual(['8675', '309', 'a', '1', 'ii'], a1ii.label) def test_build_from_section_reserved(self): - xml = u""" -
- § 8675.309 - [Reserved] -
""" - node = reg_text.build_from_section('8675', etree.fromstring(xml))[0] + with self.tree.builder("SECTION") as root: + root.SECTNO(u"§ 8675.309") + root.RESERVED("[Reserved]") + node = reg_text.build_from_section('8675', self.tree.render_xml())[0] self.assertEqual(node.label, ['8675', '309']) self.assertEqual(u'§ 8675.309 [Reserved]', node.title) self.assertEqual([], node.children) def test_build_from_section_reserved_range(self): - xml = u""" -
- §§ 8675.309-8675.311 - [Reserved] -
""" + with self.tree.builder("SECTION") as root: + root.SECTNO(u"§§ 8675.309-8675.311") + root.RESERVED("[Reserved]") n309, n310, n311 = reg_text.build_from_section( - '8675', etree.fromstring(xml)) + '8675', self.tree.render_xml()) self.assertEqual(n309.label, ['8675', '309']) self.assertEqual(n310.label, ['8675', '310']) self.assertEqual(n311.label, ['8675', '311']) @@ -147,29 +136,28 @@ def test_build_from_section_reserved_range(self): self.assertEqual(u'§ 8675.310 [Reserved]', n310.title) self.assertEqual(u'§ 8675.311 [Reserved]', n311.title) - def test_build_from_section_ambiguous(self): - xml = u""" -
- § 8675.309 - Definitions. -

(g) Some Content

-

(h) H Starts

-

(1) H-1

-

(2) H-2

-

(i) Is this 8675-309-h-2-i or 8675-309-i

-

%s

-
- """ - n8675_309 = reg_text.build_from_section( - '8675', etree.fromstring(xml % '(ii) A'))[0] + def _setup_for_ambiguous(self, final_par): + with self.tree.builder("SECTION") as root: + root.SECTNO(u"§ 8675.309") + root.SUBJECT("Definitions.") + root.P("(g) Some Content") + root.P("(h) H Starts") + root.P("(1) H-1") + root.P("(2) H-2") + root.P("(i) Is this 8675-309-h-2-i or 8675-309-i") + root.P(final_par) + return reg_text.build_from_section('8675', self.tree.render_xml())[0] + + def test_build_from_section_ambiguous_ii(self): + n8675_309 = self._setup_for_ambiguous("(ii) A") n8675_309_h = n8675_309.children[1] n8675_309_h_2 = n8675_309_h.children[1] self.assertEqual(2, len(n8675_309.children)) self.assertEqual(2, len(n8675_309_h.children)) self.assertEqual(2, len(n8675_309_h_2.children)) - n8675_309 = reg_text.build_from_section( - '8675', etree.fromstring(xml % '(A) B'))[0] + def test_build_from_section_ambiguous_A(self): + n8675_309 = self._setup_for_ambiguous("(A) B") n8675_309_h = n8675_309.children[1] n8675_309_h_2 = n8675_309_h.children[1] n8675_309_h_2_i = n8675_309_h_2.children[0] @@ -178,12 +166,12 @@ def test_build_from_section_ambiguous(self): self.assertEqual(1, len(n8675_309_h_2.children)) self.assertEqual(1, len(n8675_309_h_2_i.children)) - n8675_309 = reg_text.build_from_section( - '8675', etree.fromstring(xml % '(1) C'))[0] + def test_build_from_section_ambiguous_1(self): + n8675_309 = self._setup_for_ambiguous("(1) C") self.assertEqual(3, len(n8675_309.children)) - n8675_309 = reg_text.build_from_section( - '8675', etree.fromstring(xml % '(3) D'))[0] + def test_build_from_section_ambiguous_3(self): + n8675_309 = self._setup_for_ambiguous("(3) D") n8675_309_h = n8675_309.children[1] n8675_309_h_2 = n8675_309_h.children[1] self.assertEqual(2, len(n8675_309.children)) @@ -191,17 +179,14 @@ def test_build_from_section_ambiguous(self): self.assertEqual(1, len(n8675_309_h_2.children)) def test_build_from_section_collapsed(self): - xml = u""" -
- § 8675.309 - Definitions. -

(a) aaa

-

(1) 111

-

(2) 222—(i) iii. (A) AAA

-

(B) BBB

-
- """ - n309 = reg_text.build_from_section('8675', etree.fromstring(xml))[0] + with self.tree.builder("SECTION") as root: + root.SECTNO(u"§ 8675.309") + root.SUBJECT("Definitions.") + root.P("(a) aaa") + root.P("(1) 111") + root.P(_xml=u"""(2) 222—(i) iii. (A) AAA""") + root.P("(B) BBB") + n309 = reg_text.build_from_section('8675', self.tree.render_xml())[0] self.assertEqual(1, len(n309.children)) n309_a = n309.children[0] self.assertEqual(2, len(n309_a.children)) @@ -211,19 +196,16 @@ def test_build_from_section_collapsed(self): self.assertEqual(2, len(n309_a_2_i.children)) def test_build_from_section_italic_levels(self): - xml = u""" -
- § 8675.309 - Definitions. -

(a) aaa

-

(1) 111

-

(i) iii

-

(A) AAA

-

(1) i1i1i1

-

\n(2) i2i2i2

-
- """ - node = reg_text.build_from_section('8675', etree.fromstring(xml))[0] + with self.tree.builder("SECTION") as root: + root.SECTNO(u"§ 8675.309") + root.SUBJECT("Definitions.") + root.P("(a) aaa") + root.P("(1) 111") + root.P("(i) iii") + root.P("(A) AAA") + root.P(_xml="""(1) i1i1i1""") + root.P(_xml="""\n(2) i2i2i2""") + node = reg_text.build_from_section('8675', self.tree.render_xml())[0] self.assertEqual(1, len(node.children)) self.assertEqual(node.label, ['8675', '309']) @@ -248,25 +230,29 @@ def test_build_from_section_italic_levels(self): self.assertEqual(n2.label, ['8675', '309', 'a', '1', 'i', 'A', '2']) def test_build_from_section_bad_spaces(self): - xml = u""" -
- § 8675.16 - Subby Sub Sub. - -

(b)General.Content Content.

-
- """ - node = reg_text.build_from_section('8675', etree.fromstring(xml))[0] + with self.tree.builder("SECTION") as root: + root.SECTNO(u"§ 8675.16") + root.SUBJECT("Subby Sub Sub.") + root.STARS() + root.P(_xml="""(b)General.Content Content.""") + node = reg_text.build_from_section('8675', self.tree.render_xml())[0] self.assertEqual(1, len(node.children)) nb = node.children[0] self.assertEqual(nb.text.strip(), "(b) General. Content Content.") + def test_build_from_section_section_with_nondigits(self): + with self.tree.builder("SECTION") as root: + root.SECTNO(u"§ 8675.309a") + root.SUBJECT("Definitions.") + root.P("Intro content here") + node = reg_text.build_from_section('8675', self.tree.render_xml())[0] + self.assertEqual(node.label, ['8675', '309a']) + self.assertEqual(0, len(node.children)) + def test_get_title(self): - xml = u""" - - regulation title - """ - title = reg_text.get_title(etree.fromstring(xml)) + with self.tree.builder("PART") as root: + root.HD("regulation title") + title = reg_text.get_title(self.tree.render_xml()) self.assertEqual(u'regulation title', title) def test_get_reg_part(self): @@ -281,42 +267,37 @@ def test_get_reg_part(self): self.assertEqual(part, '204') def test_get_reg_part_fr_notice_style(self): - xml = u""" - -
-
-
- """ - part = reg_text.get_reg_part(etree.fromstring(xml)) + with self.tree.builder("REGTEXT", PART="204") as root: + root.SECTION("\n") + part = reg_text.get_reg_part(self.tree.render_xml()) self.assertEqual(part, '204') def test_get_subpart_title(self): - xml = u""" - - Subpart A—First subpart - """ - subpart_title = reg_text.get_subpart_title(etree.fromstring(xml)) + with self.tree.builder("SUBPART") as root: + root.HD(u"Subpart A—First subpart") + subpart_title = reg_text.get_subpart_title(self.tree.render_xml()) self.assertEqual(subpart_title, u'Subpart A—First subpart') + def test_get_subpart_title_reserved(self): + with self.tree.builder("SUBPART") as root: + root.RESERVED("Subpart J [Reserved]") + subpart_title = reg_text.get_subpart_title(self.tree.render_xml()) + self.assertEqual(subpart_title, u'Subpart J [Reserved]') + def test_build_subpart(self): - xml = u""" - - Subpart A—First subpart -
- § 8675.309 - Definitions. -

Some content about this section.

-

(a) something something

-
-
- § 8675.310 - Definitions. -

Some content about this section.

-

(a) something something

-
-
- """ - subpart = reg_text.build_subpart('8675', etree.fromstring(xml)) + with self.tree.builder("SUBPART") as root: + root.HD(u"Subpart A—First subpart") + with root.SECTION() as section: + section.SECTNO(u"§ 8675.309") + section.SUBJECT("Definitions.") + section.P("Some content about this section.") + section.P("(a) something something") + with root.SECTION() as section: + section.SECTNO(u"§ 8675.310") + section.SUBJECT("Definitions.") + section.P("Some content about this section.") + section.P("(a) something something") + subpart = reg_text.build_subpart('8675', self.tree.render_xml()) self.assertEqual(subpart.node_type, 'subpart') self.assertEqual(len(subpart.children), 2) self.assertEqual(subpart.label, ['8675', 'Subpart', 'A']) @@ -368,47 +349,38 @@ def test_get_markers_bad_citation(self): @patch('regparser.tree.xml_parser.reg_text.content') def test_preprocess_xml(self, content): - xml = etree.fromstring(""" - - - - Other Text - - ABCD.0123 - - - - """) + with self.tree.builder("CFRGRANULE") as root: + with root.PART() as part: + with part.APPENDIX() as appendix: + appendix.TAG("Other Text") + with appendix.GPH(DEEP=453, SPAN=2) as gph: + gph.GID("ABCD.0123") content.Macros.return_value = [ - ("//GID[./text()='ABCD.0123']/..", """ - Some Title - - EFGH.0123 - """)] - reg_text.preprocess_xml(xml) - should_be = etree.fromstring(""" - - - - Other Text - Some Title - - EFGH.0123 - - - """) - - self.assertEqual(etree.tostring(xml), etree.tostring(should_be)) + ("//GID[./text()='ABCD.0123']/..", + """Some Title""" + """EFGH.0123""")] + orig_xml = self.tree.render_xml() + reg_text.preprocess_xml(orig_xml) + + self.setUp() + with self.tree.builder("CFRGRANULE") as root: + with root.PART() as part: + with part.APPENDIX() as appendix: + appendix.TAG("Other Text") + appendix.HD("Some Title", SOURCE="HD1") + with appendix.GPH(DEEP=453, SPAN=2) as gph: + gph.GID("EFGH.0123") + + self.assertEqual(etree.tostring(orig_xml), self.tree.render_string()) def test_next_marker_stars(self): - xml = etree.fromstring(""" - -

(i) Content

- - - -

(xi) More

-
""") + with self.tree.builder("ROOT") as root: + root.P("(i) Content") + root.STARS() + root.PRTPAGE() + root.STARS() + root.P("(xi) More") + xml = self.tree.render_xml() self.assertEqual('xi', reg_text.next_marker(xml.getchildren()[0], [])) def test_build_from_section_double_alpha(self): diff --git a/tests/xml_builder.py b/tests/xml_builder.py new file mode 100644 index 0000000..fbeae86 --- /dev/null +++ b/tests/xml_builder.py @@ -0,0 +1,65 @@ +from contextlib import contextmanager + +import datatree +from datatree.render.base import Renderer +from lxml import etree + + +class LXMLBuilder(object): + """Wrapper around a datatree which provides `render` methods and removes a + bit of the redundancy found in tests. See + tests/tree_xml_parser_reg_text_tests.py for example usage""" + + @contextmanager + def builder(self, root_tag, **kwargs): + """Create a datatree with the root_tag at the root""" + tree = datatree.Tree() + tree.register_renderer(LXMLRenderer) + with getattr(tree, root_tag)(**kwargs) as root: + yield root + self.root = root + + def render_xml(self): + return self.root.render('lxml', as_root=True) + + def render_string(self): + return etree.tostring(self.render_xml()) + + +class LXMLRenderer(Renderer): + """Outputs lxml tree nodes. Based on the etree renderer""" + friendly_names = ['lxml'] + + def render_verbatim(self, tag, xml_str): + """It's sometimes easier to describe the node with raw XML""" + return etree.fromstring(u'<{0}>{1}'.format(tag, xml_str)) + + def render_attributes(self, node): + """Normal path: attributes are described via __attrs__""" + attrs = {} + for key, value in node.__attrs__.iteritems(): + attrs[key] = str(value) + element = etree.Element(node.__node_name__, attrs) + element.text = node.__value__ or "" + return element + + def render_node(self, node, parent=None, options={}): + """Generate the current node, potentially adding it to a parent, then + recurse on children""" + if '_xml' in node.__attrs__: + element = self.render_verbatim(node.__node_name__, + node.__attrs__['_xml']) + else: + element = self.render_attributes(node) + + if parent is not None: + parent.append(element) + + for child in node.__children__: + self.render_node(child, element) + + return element + + def render_final(self, rendered, options={}): + """Part of the Renderer interface""" + return rendered