Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ vendor/
.env
*.log
output/
.vscode/
.vscode/
.idea
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,13 @@

## ---- This is the Terraform-generated header for carbon-dev. ---- ## \
If this is a Lambda repo, uncomment the FUNCTION line below \
and review the other commented lines in the document.
and review the other commented lines in the document.
ECR_NAME_DEV:=carbon-dev
ECR_URL_DEV:=222053980223.dkr.ecr.us-east-1.amazonaws.com/carbon-dev
# FUNCTION_DEV:=
## ---- End of Terraform-generated header ---- ##

SHELL=/bin/bash
S3_BUCKET:=shared-files-$(shell aws sts get-caller-identity --query "Account" --output text)
ORACLE_ZIP:=instantclient-basiclite-linux.x64-21.9.0.0.0dbru.zip
DATETIME:=$(shell date -u +%Y%m%dT%H%M%SZ)

Expand All @@ -29,7 +28,8 @@ update: install # update all python dependencies
pipenv update --dev

dependencies: # download Oracle instant client zip
aws s3 cp s3://$(S3_BUCKET)/files/$(ORACLE_ZIP) vendor/$(ORACLE_ZIP)
S3_BUCKET=shared-files-$$(aws sts get-caller-identity --query "Account" --output text); \
aws s3 cp s3://$$S3_BUCKET/files/$(ORACLE_ZIP) vendor/$(ORACLE_ZIP)

## ---- Unit test commands ---- ##

Expand Down
1,261 changes: 639 additions & 622 deletions Pipfile.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,5 @@ SNS_TOPIC="<VALID_SNS_TOPIC_ARN>" # SNS topic ARN used for sending email notific
LOG_LEVEL="INFO" # The log level for the 'carbon' application. Defaults to 'INFO' if not set.
ORACLE_LIB_DIR="<PATH>" # The directory containing the Oracle Instant Client library.
SENTRY_DSN="<SENTRY_DSN>" # If set to a valid Sentry DSN, enables Sentry exception monitoring. This is not needed for local development.
ARTICLES_PUBLISH_DAYS_PAST= # If set, limits the Articles data warehouse query to rows whose PUBLISH_DATE is no more than this many days in the past. Note that publish dates can also be in the future.
```
2 changes: 2 additions & 0 deletions carbon/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ def write(self, feed_type: str) -> None:
elif feed_type == "articles":
xml_feed = ArticlesXmlFeed(engine=self.engine, output_file=self.output_file)
xml_feed.run()
else:
raise ValueError(f"Feed type not recognized: '{feed_type}'")

logger.info(
"The '%s' feed has processed %s records.",
Expand Down
2 changes: 1 addition & 1 deletion carbon/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def main(*, output_file: IO, run_connection_tests: bool, use_sns_logging: bool)
try:
pipe.run()
except Exception as error: # noqa: BLE001
logger.error("Carbon run has failed.") # noqa: TRY400
logger.error(f"Carbon run has failed: {error}") # noqa: TRY400
if use_sns_logging:
sns_log(config=config, status="fail", error=error)
else:
Expand Down
12 changes: 10 additions & 2 deletions carbon/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@

class Config:
REQUIRED_ENVIRONMENT_VARIABLES: Iterable[str] = (
"FEED_TYPE",
"DATAWAREHOUSE_CLOUDCONNECTOR_JSON",
"FEED_TYPE",
"SYMPLECTIC_FTP_JSON",
"SYMPLECTIC_FTP_PATH",
"SNS_TOPIC_ARN",
"WORKSPACE",
)
OPTIONAL_ENVIRONMENT_VARIABLES: Iterable[str] = ("ARTICLES_PUBLISH_DAYS_PAST",)

ARTICLES_PUBLISH_DAYS_PAST: str | None = None
FEED_TYPE: str
CONNECTION_STRING: str
SYMPLECTIC_FTP_USER: str
Expand Down Expand Up @@ -75,7 +78,8 @@ def configure_sentry(self) -> None:
root_logger.info("No Sentry DSN found, exceptions will not be sent to Sentry")

def load_environment_variables(self) -> None:
"""Retrieve required environment variables and populate instance attributes."""
"""Retrieve environment variables and populate instance attributes."""
# check and set REQUIRED env vars
for config_variable in self.REQUIRED_ENVIRONMENT_VARIABLES:
try:
if config_variable in [
Expand All @@ -94,3 +98,7 @@ def load_environment_variables(self) -> None:
config_variable,
)
raise

# set OPTIONAL env vars
for config_variable in self.OPTIONAL_ENVIRONMENT_VARIABLES:
setattr(self, config_variable, os.environ.get(config_variable))
1 change: 1 addition & 0 deletions carbon/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
Column("JOURNAL_VOLUME", Unicode),
Column("MIT_ID", String),
Column("PUBLISHER", Unicode),
Column("PUBLISH_DATE", String),
)


Expand Down
257 changes: 141 additions & 116 deletions carbon/feed.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from typing import IO, Any, ClassVar

from lxml import etree as ET
from sqlalchemy import func, select
from sqlalchemy import func, select, text
from sqlalchemy.sql.selectable import Select

from carbon.config import Config
from carbon.database import DatabaseEngine, aa_articles, dlcs, orcids, persons
from carbon.helpers import (
get_group_name,
Expand Down Expand Up @@ -107,13 +108,35 @@ class ArticlesXmlFeed(BaseXmlFeed):
"""Articles XML feed class."""

root_element_name = "ARTICLES"
query = (
select(aa_articles)
.where(aa_articles.c.ARTICLE_ID.is_not(None))
.where(aa_articles.c.ARTICLE_TITLE.is_not(None))
.where(aa_articles.c.DOI.is_not(None))
.where(aa_articles.c.MIT_ID.is_not(None))
)

@property
def query(self) -> Select: # type: ignore[override]
Comment on lines +112 to +113
Copy link
Contributor Author

@ghukill ghukill Jul 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moving into a @property allows for two things:

  1. a bit of logic during construction
  2. instantiation of a Config() object after the testing harness and env vars are setup

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, what do you mean by "testing harness"? 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, could have been more clear.

We set env vars in conftest.py as part of the _test_env fixture, but these aren't set until after imports take place in our files.

Originally, I had a:

config = Config()

at the top of feed.py, but this failed because the "testing harness" -- which includes the fixtures and generally anything else you'd expect to be "ready" for testing -- was not fully ready, and so the required env vars weren't set.

It worked locally when I had env vars set, and it would have worked in prod where they are also set, but not for testing.

"""Build data warehouse query for Articles.

If the env var "ARTICLES_PUBLISH_DAYS_PAST" is set, filter the query to rows
where PUBLISH_DATE is no more than this many days in the past. Note that the PUBLISH_DATE can
be in the future, so an article may be included multiple times in the XML output
until its future date has passed.
"""
config = Config()

query_object = (
select(aa_articles)
.where(aa_articles.c.ARTICLE_ID.is_not(None))
.where(aa_articles.c.ARTICLE_TITLE.is_not(None))
.where(aa_articles.c.DOI.is_not(None))
.where(aa_articles.c.MIT_ID.is_not(None))
)

if config.ARTICLES_PUBLISH_DAYS_PAST:
query_object = query_object.where(
text(
"TO_DATE(PUBLISH_DATE, 'MM/DD/YYYY') >= "
f"SYSDATE - {int(config.ARTICLES_PUBLISH_DAYS_PAST)}"
)
)

return query_object

def _add_element(self, record: dict[str, Any]) -> ET._Element:
"""Create an XML element representing an article.
Expand Down Expand Up @@ -163,119 +186,121 @@ class PeopleXmlFeed(BaseXmlFeed):
attribute of the root 'records' element when serialized.
"""

areas: tuple[str, ...] = (
"ARCHITECTURE & PLANNING AREA",
"ENGINEERING AREA",
"HUMANITIES, ARTS, & SOCIAL SCIENCES AREA",
"SCIENCE AREA",
"SLOAN SCHOOL OF MANAGEMENT AREA",
"VP RESEARCH",
"CHANCELLOR'S AREA",
"OFFICE OF PROVOST AREA",
"PROVOST AREA",
)
ps_codes: tuple[str, ...] = (
"CFAN",
"CFAT",
"CFEL",
"CSRS",
"CSRR",
"COAC",
"COAR",
"L303",
)
titles: tuple[str, ...] = (
"ADJUNCT ASSOCIATE PROFESSOR",
"ADJUNCT PROFESSOR",
"AFFILIATED ARTIST",
"ASSISTANT PROFESSOR",
"ASSOCIATE PROFESSOR",
"ASSOCIATE PROFESSOR (NOTT)",
"ASSOCIATE PROFESSOR (WOT)",
"ASSOCIATE PROFESSOR OF THE PRACTICE",
"INSTITUTE OFFICIAL - EMERITUS",
"INSTITUTE PROFESSOR (WOT)",
"INSTITUTE PROFESSOR EMERITUS",
"INSTRUCTOR",
"LECTURER",
"LECTURER II",
"POSTDOCTORAL ASSOCIATE",
"POSTDOCTORAL FELLOW",
"PRINCIPAL RESEARCH ASSOCIATE",
"PRINCIPAL RESEARCH ENGINEER",
"PRINCIPAL RESEARCH SCIENTIST",
"PROFESSOR",
"PROFESSOR (NOTT)",
"PROFESSOR (WOT)",
"PROFESSOR EMERITUS",
"PROFESSOR OF THE PRACTICE",
"RESEARCH ASSOCIATE",
"RESEARCH ENGINEER",
"RESEARCH FELLOW",
"RESEARCH SCIENTIST",
"RESEARCH SPECIALIST",
"SENIOR LECTURER",
"SENIOR POSTDOCTORAL ASSOCIATE",
"SENIOR POSTDOCTORAL FELLOW",
"SENIOR RESEARCH ASSOCIATE",
"SENIOR RESEARCH ENGINEER",
"SENIOR RESEARCH SCIENTIST",
"SENIOR RESEARCH SCIENTIST (MAP)",
"SPONSORED RESEARCH TECHNICAL STAFF",
"SPONSORED RESEARCH TECHNICAL SUPERVISOR",
"STAFF AFFILIATE",
"TECHNICAL ASSISTANT",
"TECHNICAL ASSOCIATE",
"VISITING ASSISTANT PROFESSOR",
"VISITING ASSOCIATE PROFESSOR",
"VISITING ENGINEER",
"VISITING LECTURER",
"VISITING PROFESSOR",
"VISITING RESEARCH ASSOCIATE",
"VISITING SCHOLAR",
"VISITING SCIENTIST",
"VISITING SENIOR LECTURER",
"PART-TIME FLEXIBLE/LL",
)

symplectic_elements_namespace: str = "http://www.symplectic.co.uk/hrimporter"
namespace_mapping: ClassVar[dict] = {None: symplectic_elements_namespace}

root_element_name: str = str(ET.QName(symplectic_elements_namespace, tag="records"))
query = (
select(
persons.c.MIT_ID,
persons.c.KRB_NAME_UPPERCASE,
persons.c.FIRST_NAME,
persons.c.MIDDLE_NAME,
persons.c.LAST_NAME,
persons.c.EMAIL_ADDRESS,
persons.c.DATE_TO_FACULTY,
persons.c.ORIGINAL_HIRE_DATE,
dlcs.c.DLC_NAME,
persons.c.PERSONNEL_SUBAREA_CODE,
persons.c.APPOINTMENT_END_DATE,
orcids.c.ORCID,
dlcs.c.ORG_HIER_SCHOOL_AREA_NAME,
dlcs.c.HR_ORG_LEVEL5_NAME,

@property
def query(self) -> Select: # type: ignore[override]
Comment on lines +193 to +194
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved this into a @property to be consistent with ArticlesXmlFeed.

areas: tuple[str, ...] = (
"ARCHITECTURE & PLANNING AREA",
"ENGINEERING AREA",
"HUMANITIES, ARTS, & SOCIAL SCIENCES AREA",
"SCIENCE AREA",
"SLOAN SCHOOL OF MANAGEMENT AREA",
"VP RESEARCH",
"CHANCELLOR'S AREA",
"OFFICE OF PROVOST AREA",
"PROVOST AREA",
)
.select_from(persons)
.outerjoin(orcids)
.join(dlcs)
.where(persons.c.EMAIL_ADDRESS.is_not(None))
.where(persons.c.LAST_NAME.is_not(None))
.where(persons.c.KRB_NAME_UPPERCASE.is_not(None))
.where(persons.c.KRB_NAME_UPPERCASE != "UNKNOWN")
.where(persons.c.MIT_ID.is_not(None))
.where(persons.c.ORIGINAL_HIRE_DATE.is_not(None))
.where(
persons.c.APPOINTMENT_END_DATE # noqa: SIM300
>= datetime(2009, 1, 1) # noqa: DTZ001
ps_codes: tuple[str, ...] = (
"CFAN",
"CFAT",
"CFEL",
"CSRS",
"CSRR",
"COAC",
"COAR",
"L303",
)
titles: tuple[str, ...] = (
"ADJUNCT ASSOCIATE PROFESSOR",
"ADJUNCT PROFESSOR",
"AFFILIATED ARTIST",
"ASSISTANT PROFESSOR",
"ASSOCIATE PROFESSOR",
"ASSOCIATE PROFESSOR (NOTT)",
"ASSOCIATE PROFESSOR (WOT)",
"ASSOCIATE PROFESSOR OF THE PRACTICE",
"INSTITUTE OFFICIAL - EMERITUS",
"INSTITUTE PROFESSOR (WOT)",
"INSTITUTE PROFESSOR EMERITUS",
"INSTRUCTOR",
"LECTURER",
"LECTURER II",
"POSTDOCTORAL ASSOCIATE",
"POSTDOCTORAL FELLOW",
"PRINCIPAL RESEARCH ASSOCIATE",
"PRINCIPAL RESEARCH ENGINEER",
"PRINCIPAL RESEARCH SCIENTIST",
"PROFESSOR",
"PROFESSOR (NOTT)",
"PROFESSOR (WOT)",
"PROFESSOR EMERITUS",
"PROFESSOR OF THE PRACTICE",
"RESEARCH ASSOCIATE",
"RESEARCH ENGINEER",
"RESEARCH FELLOW",
"RESEARCH SCIENTIST",
"RESEARCH SPECIALIST",
"SENIOR LECTURER",
"SENIOR POSTDOCTORAL ASSOCIATE",
"SENIOR POSTDOCTORAL FELLOW",
"SENIOR RESEARCH ASSOCIATE",
"SENIOR RESEARCH ENGINEER",
"SENIOR RESEARCH SCIENTIST",
"SENIOR RESEARCH SCIENTIST (MAP)",
"SPONSORED RESEARCH TECHNICAL STAFF",
"SPONSORED RESEARCH TECHNICAL SUPERVISOR",
"STAFF AFFILIATE",
"TECHNICAL ASSISTANT",
"TECHNICAL ASSOCIATE",
"VISITING ASSISTANT PROFESSOR",
"VISITING ASSOCIATE PROFESSOR",
"VISITING ENGINEER",
"VISITING LECTURER",
"VISITING PROFESSOR",
"VISITING RESEARCH ASSOCIATE",
"VISITING SCHOLAR",
"VISITING SCIENTIST",
"VISITING SENIOR LECTURER",
"PART-TIME FLEXIBLE/LL",
)

return (
select(
persons.c.MIT_ID,
persons.c.KRB_NAME_UPPERCASE,
persons.c.FIRST_NAME,
persons.c.MIDDLE_NAME,
persons.c.LAST_NAME,
persons.c.EMAIL_ADDRESS,
persons.c.DATE_TO_FACULTY,
persons.c.ORIGINAL_HIRE_DATE,
dlcs.c.DLC_NAME,
persons.c.PERSONNEL_SUBAREA_CODE,
persons.c.APPOINTMENT_END_DATE,
orcids.c.ORCID,
dlcs.c.ORG_HIER_SCHOOL_AREA_NAME,
dlcs.c.HR_ORG_LEVEL5_NAME,
)
.select_from(persons)
.outerjoin(orcids)
.join(dlcs)
.where(persons.c.EMAIL_ADDRESS.is_not(None))
.where(persons.c.LAST_NAME.is_not(None))
.where(persons.c.KRB_NAME_UPPERCASE.is_not(None))
.where(persons.c.KRB_NAME_UPPERCASE != "UNKNOWN")
.where(persons.c.MIT_ID.is_not(None))
.where(persons.c.ORIGINAL_HIRE_DATE.is_not(None))
.where(
persons.c.APPOINTMENT_END_DATE # noqa: SIM300
>= datetime(2009, 1, 1) # noqa: DTZ001
)
.where(func.upper(dlcs.c.ORG_HIER_SCHOOL_AREA_NAME).in_(areas))
.where(persons.c.PERSONNEL_SUBAREA_CODE.in_(ps_codes))
.where(func.upper(persons.c.JOB_TITLE).in_(titles))
)
.where(func.upper(dlcs.c.ORG_HIER_SCHOOL_AREA_NAME).in_(areas))
.where(persons.c.PERSONNEL_SUBAREA_CODE.in_(ps_codes))
.where(func.upper(persons.c.JOB_TITLE).in_(titles))
)

def _add_element(self, record: dict[str, Any]) -> ET._Element:
"""Create an XML element representing a person.
Expand Down
Loading