UIUCLibrary · Copilot · Dec 23, 2025 · Dec 23, 2025 · Dec 23, 2025 · Dec 23, 2025
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -0,0 +1,98 @@
+# Copilot Agent Instructions for arcflow
+
+This file provides guidance for GitHub Copilot agents working on the arcflow repository.
+
+## Commit Style
+
+When making changes to this repository, use **granular, single-purpose commits**:
+
+### Guidelines
+
+- **One commit per logical change** - Each commit should do one thing and do it well
+- **Separate refactoring from features** - Don't mix code restructuring with new functionality
+- **Clear, descriptive messages** - Explain what the commit does and why
+- **Include imports with usage** - Add necessary imports in the same commit where they're used, not as separate commits
+
+### Examples
+
+Good commit sequence:
+```
+1. Refactor XML injection logic for extensibility
+2. Add linked_agents to resolve parameter
+3. Add get_creator_bioghist method
+   (includes import of xml.sax.saxutils.escape used in the method)
+4. Integrate bioghist into XML injection
+5. Update comment to reflect new behavior
+```
+
+Bad commit sequences:
+
+Too dense:
+```
+1. Add creator biographical information to EAD XML exports
+   (combines refactoring, new imports, new methods, and integration)
+```
+
+Too granular:
+```
+1. Import xml.sax.saxutils.escape
+2. Add get_creator_bioghist method that uses xml.sax.saxutils.escape
+   (import should have been included in this commit)
+```
+
+### Commit Message Format
+
+- **First line**: Clear, concise summary (aim for about 50 characters; never more than 72)
+- **Body** (optional): Bullet points explaining the changes
+- **Keep it focused**: If you need many bullets, consider splitting into multiple commits
+
+### Why This Matters
+
+- Makes code review easier
+- Helps understand the progression of changes
+- Easier to revert specific changes if needed
+- Clear history for future maintainers
+
+---
+
+## XML Content Handling in EAD Pipeline
+
+When injecting content into EAD XML files, distinguish between plain text and structured XML:
+
+### Escaping Strategy
+
+- **Plain text labels** (recordgroup, subgroup): Use `xml_escape()` to escape special characters (`&`, `<`, `>`)
+  - These are simple strings that may contain characters that break XML syntax
+  - Example: `xml_escape(rg_label)` → converts `"Group & Co"` to `"Group &amp; Co"`
+
+- **Structured EAD XML content** (bioghist, scopecontent): Do NOT escape
+  - Content from ArchivesSpace already contains valid EAD XML markup (`<emph>`, `<title>`, etc.)
+  - These are legitimate XML nodes that must be preserved
+  - Escaping would convert them to literal text: `<emph>` → `&lt;emph&gt;`
+  - Example: Wrap each line of the content as-is, without escaping: `f'<p>{line}</p>'`
+
+### Why This Matters
+
+The Traject indexing pipeline and ArcLight display rely on proper XML structure:
+1. The Traject indexing config calls Nokogiri's `.to_html` to convert XML nodes to HTML
+2. ArcLight's `render_html_tags` processes the HTML for display
+3. If XML nodes are escaped (treated as text), they can't be processed and appear as raw markup
+
+### Pattern for Future Fields
+
+When adding new EAD fields to the pipeline:
+1. Determine if content is plain text or structured XML
+2. Apply escaping only to plain text
+3. Pass structured XML through unchanged
+4. Document the decision in code comments
+
+---
+
+## Adding More Instructions
+
+To add additional instructions to this file:
+
+1. Add a new section with a clear heading (e.g., `## Testing Strategy`, `## Code Style`)
+2. Keep instructions concise and actionable
+3. Use examples where helpful
+4. Maintain the simple, scannable format
diff --git a/arcflow/main.py b/arcflow/main.py
@@ -9,6 +9,7 @@
 import re
 import logging
 from xml.dom.pulldom import parse, START_ELEMENT
+from xml.sax.saxutils import escape as xml_escape
 from datetime import datetime, timezone
 from asnake.client import ASnakeClient
 from multiprocessing.pool import ThreadPool as Pool
@@ -205,7 +206,7 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0):
         resource = self.client.get(
             f'{repo["uri"]}/resources/{resource_id}',
             params={
-                'resolve': ['classifications', 'classification_terms'],
+                'resolve': ['classifications', 'classification_terms', 'linked_agents'],
             }).json()
 
         xml_file_path = f'{xml_dir}/{resource["ead_id"]}.xml'
@@ -225,24 +226,41 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0):
                     'ead3': 'false',
                 })
 
-            # add record group and subgroup labels to EAD inside <archdesc level="collection">
+            # add custom XML elements to EAD inside <archdesc level="collection">
+            # (record group/subgroup labels and biographical/historical notes)
             if xml.content:
-                rg_label, sg_label = extract_labels(resource)[1:3]
-                if rg_label:
-                    xml_content = xml.content.decode('utf-8')
-                    insert_pos = xml_content.find('<archdesc level="collection">')
+                xml_content = xml.content.decode('utf-8')
+                insert_pos = xml_content.find('<archdesc level="collection">')
+
+                if insert_pos != -1:
+                    # Find the start of the first closing </did> tag after <archdesc>
+                    insert_pos = xml_content.find('</did>', insert_pos)
+
                     if insert_pos != -1:
-                        # Find the position after the opening tag
-                        insert_pos = xml_content.find('</did>', insert_pos)
-                        extra_xml = f'<recordgroup>{rg_label}</recordgroup>'
-                        if sg_label:
-                            extra_xml += f'<subgroup>{sg_label}</subgroup>'
-                        xml_content = (xml_content[:insert_pos] + 
-                            extra_xml + 
-                            xml_content[insert_pos:])
-                    xml_content = xml_content.encode('utf-8')
-                else:
-                    xml_content = xml.content
+                        # Move to after the </did> tag
+                        insert_pos += len('</did>')
+                        extra_xml = ''
+
+                        # Add record group and subgroup labels
+                        rg_label, sg_label = extract_labels(resource)[1:3]
+                        if rg_label:
+                            extra_xml += f'\n<recordgroup>{xml_escape(rg_label)}</recordgroup>'
+                            if sg_label:
+                                extra_xml += f'\n<subgroup>{xml_escape(sg_label)}</subgroup>'
+
+                        # Add biographical/historical notes from creator agents
+                        bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size)
+                        if bioghist_content:
+                            extra_xml += f'\n{bioghist_content}'
+
+                        if extra_xml:
+                            xml_content = (xml_content[:insert_pos] + 
+                                extra_xml + 
+                                xml_content[insert_pos:])
+
+                xml_content = xml_content.encode('utf-8')
+            else:
+                xml_content = xml.content
 
             # next level of indentation for nested operations
             indent_size += 2
@@ -499,6 +517,64 @@ def index(self, repo_id, xml_file_path, indent_size=0):
             self.log.error(f'{indent}Error indexing pending resources in repository ID {repo_id} to ArcLight Solr: {e}')
 
 
+    def get_creator_bioghist(self, resource, indent_size=0):
+        """
+        Build EAD <bioghist> XML from the notes of agents linked to the resource with the 'creator' role.
+        Returns one concatenated string with a <bioghist> per note_bioghist note that has content, else None.
+        Each <bioghist> gets id 'aspace_' + last segment of the agent URI and the escaped agent name in <head>.
+        """
+        indent = ' ' * indent_size
+        bioghist_elements = []
+
+        if 'linked_agents' not in resource:
+            return None
+
+        # Walk linked_agents in list order (presumably mirrors the EAD origination order - TODO confirm)
+        for linked_agent in resource['linked_agents']:
+            # Only process agents with 'creator' role
+            if linked_agent.get('role') == 'creator':
+                agent_ref = linked_agent.get('ref')
+                if agent_ref:
+                    try:
+                        agent = self.client.get(agent_ref).json()
+
+                        # Last URI segment is the id suffix. NOTE(review): reused for every note of this agent, so ids can repeat
+                        agent_id = agent_ref.split('/')[-1] if agent_ref else ''
+
+                        # Head text: 'title' if truthy, else display_name.sort_name, else the literal 'Unknown'
+                        agent_name = agent.get('title') or agent.get('display_name', {}).get('sort_name', 'Unknown')
+
+                        # Check for notes in the agent record
+                        if 'notes' in agent:
+                            for note in agent['notes']:
+                                # Look for biographical/historical notes
+                                if note.get('jsonmodel_type') == 'note_bioghist':
+                                    # Collect <p> elements from every subnote that has a 'content' key
+                                    paragraphs = []
+                                    if 'subnotes' in note:
+                                        for subnote in note['subnotes']:
+                                            if 'content' in subnote:
+                                                # Each non-blank line of the content becomes its own paragraph
+                                                content = subnote['content']
+                                                # Split on newline, strip whitespace, and drop lines that end up empty
+                                                lines = [line.strip() for line in content.split('\n') if line.strip()]
+                                                # Wrap each line in <p> tags; NOT escaped (assumed to already be valid EAD markup)
+                                                for line in lines:
+                                                    paragraphs.append(f'<p>{line}</p>')
+
+                                    # Emit a bioghist element for this note only if it produced paragraphs
+                                    if paragraphs:
+                                        paragraphs_xml = ''.join(paragraphs)
+                                        bioghist_el = f'<bioghist id="aspace_{agent_id}"><head>{xml_escape(agent_name)}</head>{paragraphs_xml}</bioghist>'
+                                        bioghist_elements.append(bioghist_el)
+                    except Exception as e:  # best-effort: log the failure and continue with the next agent
+                        self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}')
+
+        if bioghist_elements:
+            return ''.join(bioghist_elements)
+        return None
+
+
     def get_repo_id(self, repo):
         """
         Get the repository ID from the repository URI.