diff --git a/regparser/grammar/unified.py b/regparser/grammar/unified.py
index 12aa8da..8e860da 100644
--- a/regparser/grammar/unified.py
+++ b/regparser/grammar/unified.py
@@ -112,6 +112,7 @@ def appendix_section(match):
atomic.subpart_marker.copy().setParseAction(keep_pos).setResultsName(
"marker")
+ atomic.subpart)
+
marker_subpart_title = (
atomic.subpart_marker.copy().setParseAction(keep_pos).setResultsName(
"marker")
diff --git a/regparser/tree/xml_parser/reg_text.py b/regparser/tree/xml_parser/reg_text.py
index fbf4288..ec64a30 100644
--- a/regparser/tree/xml_parser/reg_text.py
+++ b/regparser/tree/xml_parser/reg_text.py
@@ -332,8 +332,11 @@ def build_from_section(reg_part, section_xml):
nodes = []
section_nums = []
- for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
- section_nums.append(int(match.group(1)))
+ for match in re.finditer(r'%s\.(\d+[a-z]*)' % reg_part, section_no):
+ secnum_candidate = match.group(1)
+ if secnum_candidate.isdigit():
+ secnum_candidate = int(secnum_candidate)
+ section_nums.append(secnum_candidate)
# Span of section numbers
if u'§§' == section_no[:2] and '-' in section_no:
diff --git a/requirements_test.txt b/requirements_test.txt
index 605e75b..755f0af 100644
--- a/requirements_test.txt
+++ b/requirements_test.txt
@@ -1,5 +1,6 @@
cov-core==1.7
coverage==3.6
+datatree==0.1.8.1
flake8==2.4.0
mock==1.0.1
nose==1.2.1
diff --git a/tests/grammar_unified_tests.py b/tests/grammar_unified_tests.py
index c7cd4cc..68b3da7 100644
--- a/tests/grammar_unified_tests.py
+++ b/tests/grammar_unified_tests.py
@@ -39,3 +39,16 @@ def test_marker_comment(self):
result = unified.marker_comment.parseString(t)
self.assertEqual("3", result.section)
self.assertEqual("4", result.c1)
+
+ def test_marker_subpart_title(self):
+ # Typical case: subpart letter and title joined by an em dash (\u2014)
+ text = u'Subpart K\u2014Exportation'
+ result = unified.marker_subpart_title.parseString(text)
+ self.assertEqual(u'Exportation', result.subpart_title)
+ self.assertEqual(u'K', result.subpart)
+
+ # Reserved subpart: no dash; the bracketed text is expected as the title
+ text = u'Subpart J [Reserved]'
+ result = unified.marker_subpart_title.parseString(text)
+ self.assertEqual(u'[Reserved]', result.subpart_title)
+ self.assertEqual(u'J', result.subpart)
diff --git a/tests/tree_xml_parser_reg_text_tests.py b/tests/tree_xml_parser_reg_text_tests.py
index d50f20d..31ce3e5 100644
--- a/tests/tree_xml_parser_reg_text_tests.py
+++ b/tests/tree_xml_parser_reg_text_tests.py
@@ -5,19 +5,21 @@
from mock import patch
from regparser.tree.xml_parser import reg_text
+from tests.xml_builder import LXMLBuilder
class RegTextTest(TestCase):
+ def setUp(self):
+ self.tree = LXMLBuilder()
+
def test_build_from_section_intro_text(self):
- xml = u"""
-
- § 8675.309
- Definitions.
- Some content about this section.
- (a) something something
-
- """
- node = reg_text.build_from_section('8675', etree.fromstring(xml))[0]
+ with self.tree.builder("SECTION") as root:
+ root.SECTNO(u"§ 8675.309")
+ root.SUBJECT("Definitions.")
+ root.P("Some content about this section.")
+ root.P("(a) something something")
+ node = reg_text.build_from_section('8675', self.tree.render_xml())[0]
+ self.assertEqual('Some content about this section.', node.text.strip())
self.assertEqual(1, len(node.children))
self.assertEqual(['8675', '309'], node.label)
@@ -59,16 +61,13 @@ def test_build_from_section_unnumbered_defs(self):
self.assertEqual(['8675', '309', 'Bop'], child.label)
def test_build_from_section_collapsed_level(self):
- xml = u"""
-
- § 8675.309
- Definitions.
- (a) Transfers —(1) Notice. follow
-
- (b) Contents (1) Here
-
- """
- node = reg_text.build_from_section('8675', etree.fromstring(xml))[0]
+ with self.tree.builder("SECTION") as root:
+ root.SECTNO(u"§ 8675.309")
+ root.SUBJECT("Definitions.")
+ root.P(_xml=u"""(a) Transfers —(1)
+ Notice. follow""")
+ root.P(_xml="""(b) Contents (1) Here""")
+ node = reg_text.build_from_section('8675', self.tree.render_xml())[0]
self.assertEqual(node.label, ['8675', '309'])
self.assertEqual(2, len(node.children))
self.assertEqual(node.children[0].label, ['8675', '309', 'a'])
@@ -80,32 +79,26 @@ def test_build_from_section_collapsed_level(self):
self.assertEqual(1, len(node.children[1].children))
def test_build_from_section_collapsed_level_emph(self):
- xml = u"""
-
- § 8675.309
- Definitions.
- (a) aaaa
- (1) 1111
- (i) iiii
- (A) AAA—(1) eeee
-
- """
- node = reg_text.build_from_section('8675', etree.fromstring(xml))[0]
+ with self.tree.builder('SECTION') as root:
+ root.SECTNO(u"§ 8675.309")
+ root.SUBJECT("Definitions.")
+ root.P("(a) aaaa")
+ root.P("(1) 1111")
+ root.P("(i) iiii")
+ root.P(_xml=u"""(A) AAA—(1) eeee""")
+ node = reg_text.build_from_section('8675', self.tree.render_xml())[0]
a1iA = node.children[0].children[0].children[0].children[0]
self.assertEqual(u"(A) AAA—", a1iA.text)
self.assertEqual(1, len(a1iA.children))
self.assertEqual("(1) eeee", a1iA.children[0].text.strip())
def test_build_from_section_double_collapsed(self):
- xml = u"""
-
- § 8675.309
- Definitions.
- (a) Keyterm—(1)(i) Content
- (ii) Content2
-
- """
- node = reg_text.build_from_section('8675', etree.fromstring(xml))[0]
+ with self.tree.builder('SECTION') as root:
+ root.SECTNO(u'§ 8675.309')
+ root.SUBJECT('Definitions.')
+ root.P(_xml=u"""(a) Keyterm—(1)(i) Content""")
+ root.P("(ii) Content2")
+ node = reg_text.build_from_section('8675', self.tree.render_xml())[0]
self.assertEqual(['8675', '309'], node.label)
self.assertEqual(1, len(node.children))
@@ -122,24 +115,20 @@ def test_build_from_section_double_collapsed(self):
self.assertEqual(['8675', '309', 'a', '1', 'ii'], a1ii.label)
def test_build_from_section_reserved(self):
- xml = u"""
-
- § 8675.309
- [Reserved]
- """
- node = reg_text.build_from_section('8675', etree.fromstring(xml))[0]
+ with self.tree.builder("SECTION") as root:
+ root.SECTNO(u"§ 8675.309")
+ root.RESERVED("[Reserved]")
+ node = reg_text.build_from_section('8675', self.tree.render_xml())[0]
self.assertEqual(node.label, ['8675', '309'])
self.assertEqual(u'§ 8675.309 [Reserved]', node.title)
self.assertEqual([], node.children)
def test_build_from_section_reserved_range(self):
- xml = u"""
-
- §§ 8675.309-8675.311
- [Reserved]
- """
+ with self.tree.builder("SECTION") as root:
+ root.SECTNO(u"§§ 8675.309-8675.311")
+ root.RESERVED("[Reserved]")
n309, n310, n311 = reg_text.build_from_section(
- '8675', etree.fromstring(xml))
+ '8675', self.tree.render_xml())
self.assertEqual(n309.label, ['8675', '309'])
self.assertEqual(n310.label, ['8675', '310'])
self.assertEqual(n311.label, ['8675', '311'])
@@ -147,29 +136,28 @@ def test_build_from_section_reserved_range(self):
self.assertEqual(u'§ 8675.310 [Reserved]', n310.title)
self.assertEqual(u'§ 8675.311 [Reserved]', n311.title)
- def test_build_from_section_ambiguous(self):
- xml = u"""
-
- § 8675.309
- Definitions.
- (g) Some Content
- (h) H Starts
- (1) H-1
- (2) H-2
- (i) Is this 8675-309-h-2-i or 8675-309-i
- %s
-
- """
- n8675_309 = reg_text.build_from_section(
- '8675', etree.fromstring(xml % '(ii) A'))[0]
+ def _setup_for_ambiguous(self, final_par):
+ with self.tree.builder("SECTION") as root:
+ root.SECTNO(u"§ 8675.309")
+ root.SUBJECT("Definitions.")
+ root.P("(g) Some Content")
+ root.P("(h) H Starts")
+ root.P("(1) H-1")
+ root.P("(2) H-2")
+ root.P("(i) Is this 8675-309-h-2-i or 8675-309-i")
+ root.P(final_par)
+ return reg_text.build_from_section('8675', self.tree.render_xml())[0]
+
+ def test_build_from_section_ambiguous_ii(self):
+ n8675_309 = self._setup_for_ambiguous("(ii) A")
n8675_309_h = n8675_309.children[1]
n8675_309_h_2 = n8675_309_h.children[1]
self.assertEqual(2, len(n8675_309.children))
self.assertEqual(2, len(n8675_309_h.children))
self.assertEqual(2, len(n8675_309_h_2.children))
- n8675_309 = reg_text.build_from_section(
- '8675', etree.fromstring(xml % '(A) B'))[0]
+ def test_build_from_section_ambiguous_A(self):
+ n8675_309 = self._setup_for_ambiguous("(A) B")
n8675_309_h = n8675_309.children[1]
n8675_309_h_2 = n8675_309_h.children[1]
n8675_309_h_2_i = n8675_309_h_2.children[0]
@@ -178,12 +166,12 @@ def test_build_from_section_ambiguous(self):
self.assertEqual(1, len(n8675_309_h_2.children))
self.assertEqual(1, len(n8675_309_h_2_i.children))
- n8675_309 = reg_text.build_from_section(
- '8675', etree.fromstring(xml % '(1) C'))[0]
+ def test_build_from_section_ambiguous_1(self):
+ n8675_309 = self._setup_for_ambiguous("(1) C")
self.assertEqual(3, len(n8675_309.children))
- n8675_309 = reg_text.build_from_section(
- '8675', etree.fromstring(xml % '(3) D'))[0]
+ def test_build_from_section_ambiguous_3(self):
+ n8675_309 = self._setup_for_ambiguous("(3) D")
n8675_309_h = n8675_309.children[1]
n8675_309_h_2 = n8675_309_h.children[1]
self.assertEqual(2, len(n8675_309.children))
@@ -191,17 +179,14 @@ def test_build_from_section_ambiguous(self):
self.assertEqual(1, len(n8675_309_h_2.children))
def test_build_from_section_collapsed(self):
- xml = u"""
-
- § 8675.309
- Definitions.
- (a) aaa
- (1) 111
- (2) 222—(i) iii. (A) AAA
- (B) BBB
-
- """
- n309 = reg_text.build_from_section('8675', etree.fromstring(xml))[0]
+ with self.tree.builder("SECTION") as root:
+ root.SECTNO(u"§ 8675.309")
+ root.SUBJECT("Definitions.")
+ root.P("(a) aaa")
+ root.P("(1) 111")
+ root.P(_xml=u"""(2) 222—(i) iii. (A) AAA""")
+ root.P("(B) BBB")
+ n309 = reg_text.build_from_section('8675', self.tree.render_xml())[0]
self.assertEqual(1, len(n309.children))
n309_a = n309.children[0]
self.assertEqual(2, len(n309_a.children))
@@ -211,19 +196,16 @@ def test_build_from_section_collapsed(self):
self.assertEqual(2, len(n309_a_2_i.children))
def test_build_from_section_italic_levels(self):
- xml = u"""
-
- § 8675.309
- Definitions.
- (a) aaa
- (1) 111
- (i) iii
- (A) AAA
- (1) i1i1i1
- \n(2) i2i2i2
-
- """
- node = reg_text.build_from_section('8675', etree.fromstring(xml))[0]
+ with self.tree.builder("SECTION") as root:
+ root.SECTNO(u"§ 8675.309")
+ root.SUBJECT("Definitions.")
+ root.P("(a) aaa")
+ root.P("(1) 111")
+ root.P("(i) iii")
+ root.P("(A) AAA")
+ root.P(_xml="""(1) i1i1i1""")
+ root.P(_xml="""\n(2) i2i2i2""")
+ node = reg_text.build_from_section('8675', self.tree.render_xml())[0]
self.assertEqual(1, len(node.children))
self.assertEqual(node.label, ['8675', '309'])
@@ -248,25 +230,29 @@ def test_build_from_section_italic_levels(self):
self.assertEqual(n2.label, ['8675', '309', 'a', '1', 'i', 'A', '2'])
def test_build_from_section_bad_spaces(self):
- xml = u"""
-
- § 8675.16
- Subby Sub Sub.
-
- (b)General.Content Content.
-
- """
- node = reg_text.build_from_section('8675', etree.fromstring(xml))[0]
+ with self.tree.builder("SECTION") as root:
+ root.SECTNO(u"§ 8675.16")
+ root.SUBJECT("Subby Sub Sub.")
+ root.STARS()
+ root.P(_xml="""(b)General.Content Content.""")
+ node = reg_text.build_from_section('8675', self.tree.render_xml())[0]
self.assertEqual(1, len(node.children))
nb = node.children[0]
self.assertEqual(nb.text.strip(), "(b) General. Content Content.")
+ def test_build_from_section_section_with_nondigits(self):
+ with self.tree.builder("SECTION") as root:
+ root.SECTNO(u"§ 8675.309a")
+ root.SUBJECT("Definitions.")
+ root.P("Intro content here")
+ node = reg_text.build_from_section('8675', self.tree.render_xml())[0]
+ self.assertEqual(node.label, ['8675', '309a'])
+ self.assertEqual(0, len(node.children))
+
def test_get_title(self):
- xml = u"""
-
- regulation title
- """
- title = reg_text.get_title(etree.fromstring(xml))
+ with self.tree.builder("PART") as root:
+ root.HD("regulation title")
+ title = reg_text.get_title(self.tree.render_xml())
self.assertEqual(u'regulation title', title)
def test_get_reg_part(self):
@@ -281,42 +267,37 @@ def test_get_reg_part(self):
self.assertEqual(part, '204')
def test_get_reg_part_fr_notice_style(self):
- xml = u"""
-
-
-
- """
- part = reg_text.get_reg_part(etree.fromstring(xml))
+ with self.tree.builder("REGTEXT", PART="204") as root:
+ root.SECTION("\n")
+ part = reg_text.get_reg_part(self.tree.render_xml())
self.assertEqual(part, '204')
def test_get_subpart_title(self):
- xml = u"""
-
- Subpart A—First subpart
- """
- subpart_title = reg_text.get_subpart_title(etree.fromstring(xml))
+ with self.tree.builder("SUBPART") as root:
+ root.HD(u"Subpart A—First subpart")
+ subpart_title = reg_text.get_subpart_title(self.tree.render_xml())
self.assertEqual(subpart_title, u'Subpart A—First subpart')
+ def test_get_subpart_title_reserved(self):
+ with self.tree.builder("SUBPART") as root:
+ root.RESERVED("Subpart J [Reserved]")
+ subpart_title = reg_text.get_subpart_title(self.tree.render_xml())
+ self.assertEqual(subpart_title, u'Subpart J [Reserved]')
+
def test_build_subpart(self):
- xml = u"""
-
- Subpart A—First subpart
-
- § 8675.309
- Definitions.
- Some content about this section.
- (a) something something
-
-
- § 8675.310
- Definitions.
- Some content about this section.
- (a) something something
-
-
- """
- subpart = reg_text.build_subpart('8675', etree.fromstring(xml))
+ with self.tree.builder("SUBPART") as root:
+ root.HD(u"Subpart A—First subpart")
+ with root.SECTION() as section:
+ section.SECTNO(u"§ 8675.309")
+ section.SUBJECT("Definitions.")
+ section.P("Some content about this section.")
+ section.P("(a) something something")
+ with root.SECTION() as section:
+ section.SECTNO(u"§ 8675.310")
+ section.SUBJECT("Definitions.")
+ section.P("Some content about this section.")
+ section.P("(a) something something")
+ subpart = reg_text.build_subpart('8675', self.tree.render_xml())
self.assertEqual(subpart.node_type, 'subpart')
self.assertEqual(len(subpart.children), 2)
self.assertEqual(subpart.label, ['8675', 'Subpart', 'A'])
@@ -368,47 +349,38 @@ def test_get_markers_bad_citation(self):
@patch('regparser.tree.xml_parser.reg_text.content')
def test_preprocess_xml(self, content):
- xml = etree.fromstring("""
-
-
-
- Other Text
-
- ABCD.0123
-
-
-
- """)
+ with self.tree.builder("CFRGRANULE") as root:
+ with root.PART() as part:
+ with part.APPENDIX() as appendix:
+ appendix.TAG("Other Text")
+ with appendix.GPH(DEEP=453, SPAN=2) as gph:
+ gph.GID("ABCD.0123")
content.Macros.return_value = [
- ("//GID[./text()='ABCD.0123']/..", """
- Some Title
-
- EFGH.0123
- """)]
- reg_text.preprocess_xml(xml)
- should_be = etree.fromstring("""
-
-
-
- Other Text
- Some Title
-
- EFGH.0123
-
-
- """)
-
- self.assertEqual(etree.tostring(xml), etree.tostring(should_be))
+ ("//GID[./text()='ABCD.0123']/..",
+ """Some Title"""
+ """EFGH.0123""")]
+ orig_xml = self.tree.render_xml()
+ reg_text.preprocess_xml(orig_xml)
+
+ self.setUp()
+ with self.tree.builder("CFRGRANULE") as root:
+ with root.PART() as part:
+ with part.APPENDIX() as appendix:
+ appendix.TAG("Other Text")
+ appendix.HD("Some Title", SOURCE="HD1")
+ with appendix.GPH(DEEP=453, SPAN=2) as gph:
+ gph.GID("EFGH.0123")
+
+ self.assertEqual(etree.tostring(orig_xml), self.tree.render_string())
def test_next_marker_stars(self):
- xml = etree.fromstring("""
-
- (i) Content
-
-
-
- (xi) More
- """)
+ with self.tree.builder("ROOT") as root:
+ root.P("(i) Content")
+ root.STARS()
+ root.PRTPAGE()
+ root.STARS()
+ root.P("(xi) More")
+ xml = self.tree.render_xml()
self.assertEqual('xi', reg_text.next_marker(xml.getchildren()[0], []))
def test_build_from_section_double_alpha(self):
diff --git a/tests/xml_builder.py b/tests/xml_builder.py
new file mode 100644
index 0000000..fbeae86
--- /dev/null
+++ b/tests/xml_builder.py
@@ -0,0 +1,80 @@
+from contextlib import contextmanager
+
+import datatree
+from datatree.render.base import Renderer
+from lxml import etree
+
+
+class LXMLBuilder(object):
+    """Wrapper around a datatree which provides `render` methods and removes a
+    bit of the redundancy found in tests. See
+    tests/tree_xml_parser_reg_text_tests.py for example usage.
+
+    Describe the tree inside a `with self.builder(...)` block and call the
+    render methods after that block has exited; `self.root` is only assigned
+    once the block's body has completed."""
+
+    @contextmanager
+    def builder(self, root_tag, **kwargs):
+        """Create a datatree with the root_tag at the root and yield that root
+        node. Keyword arguments are forwarded to the root node's constructor
+        (the tests use them for XML attributes, e.g. PART="204")."""
+        tree = datatree.Tree()
+        tree.register_renderer(LXMLRenderer)
+        with getattr(tree, root_tag)(**kwargs) as root:
+            yield root
+        # Runs after the caller's `with` body has finished
+        self.root = root
+
+    def render_xml(self):
+        """Render the most recently built tree with the 'lxml' renderer"""
+        return self.root.render('lxml', as_root=True)
+
+    def render_string(self):
+        """Serialize the rendered tree with etree.tostring"""
+        return etree.tostring(self.render_xml())
+
+
+class LXMLRenderer(Renderer):
+    """Outputs lxml tree nodes. Based on the etree renderer"""
+    friendly_names = ['lxml']
+
+    def render_verbatim(self, tag, xml_str):
+        """It's sometimes easier to describe the node with raw XML. xml_str is
+        parsed as the inner markup of a <tag> element, so it must be
+        well-formed once wrapped."""
+        # Wrap in a matching open/close pair so the fragment parses as one node
+        return etree.fromstring(u'<{0}>{1}</{0}>'.format(tag, xml_str))
+
+    def render_attributes(self, node):
+        """Normal path: attributes are described via __attrs__. Each value is
+        passed through str() and the node's __value__ (or "" when falsy)
+        becomes the element text."""
+        attrs = {}
+        for key, value in node.__attrs__.iteritems():
+            attrs[key] = str(value)
+        element = etree.Element(node.__node_name__, attrs)
+        element.text = node.__value__ or ""
+        return element
+
+    def render_node(self, node, parent=None, options={}):
+        """Generate the current node, potentially adding it to a parent, then
+        recurse on children. A node carrying an `_xml` attribute is built
+        from that raw markup; its other attributes and value are not used."""
+        if '_xml' in node.__attrs__:
+            element = self.render_verbatim(node.__node_name__,
+                                           node.__attrs__['_xml'])
+        else:
+            element = self.render_attributes(node)
+
+        if parent is not None:
+            parent.append(element)
+
+        for child in node.__children__:
+            self.render_node(child, element)
+
+        return element
+
+    def render_final(self, rendered, options={}):
+        """Part of the Renderer interface"""
+        return rendered