diff --git a/.env.sample b/.env.sample
index 5e1c6ad61..d4e8b946c 100644
--- a/.env.sample
+++ b/.env.sample
@@ -1,2 +1,3 @@
BLACK_SWAN_DATABASE_URL=postgres://localhost/black-swan-development?sslmode=disable
DRAFTS=true
+LOCAL_FONTS=true
diff --git a/.forego b/.forego
new file mode 100644
index 000000000..87060cb99
--- /dev/null
+++ b/.forego
@@ -0,0 +1 @@
+port: 5002
diff --git a/.gitignore b/.gitignore
index 608a4b2ab..36f240b09 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
public/
+public-dev/
+*.csv
+.DS_Store
.env
+.envrc
tags
diff --git a/.travis.yml b/.travis.yml
index 422d7c833..5227fdf51 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
language: go
go:
- - 1.6
+ - 1.9
# magic word to use faster/newer container-based architecture
sudo: false
@@ -14,7 +14,13 @@ install:
script:
- VERBOSE=true make
- - make deploy
+
+ # deploy development version that includes drafts
+ - TARGET_DIR=./public-dev DRAFTS=true make build
+ - TARGET_DIR=./public-dev S3_BUCKET=brandur.org-dev make deploy
+
+ # then deploy production
+ - TARGET_DIR=./public S3_BUCKET=brandur.org make deploy
notifications:
email:
@@ -24,7 +30,14 @@ notifications:
# automatically.
env:
global:
- - S3_BUCKET=brandur.org
+ - CLOUDFRONT_ID=E2D97SPIHRBCUA
+ - GOOGLE_ANALYTICS_ID=UA-47798518-1
+
+ # Use only a single Goroutine so that we can debug problems that arise more
+ # easily. The build in CI is already lengthy due to system setup, and
+ # shaving a few hundred milliseconds off with more concurrency makes almost
+ # no difference at all.
+ - CONCURRENCY=1
# $AWS_ACCESS_KEY_ID
- secure: "N7O3iKGp51NBnkrgAjNrVbtV2sxmiG8NY07oNVvkhzPRhdOSB+nXwhEk+K3Bz7AiC+2WGXBZRx2jvjDu3w3L9k4NnsEFtOlt+7BBcDUHt2xqLkQNBTj5k51F+3oCJPoBEzwL7Cb8YAzFhcVyrgp903R1oHy5F+fs+ICRkapID92/6yCpFt86O7tT6u7jsQuEtmc/uT5EQzy3h7A1zNnIReouHGx3ZLNKqdTObHrKHV2IILR9pI7TF/BZLHQ20uyQheoIz8B0MeGpPOK4AFNTt8L/qEp89HvymjFY90d40o1MXYPlOvDktYMXyJhtgMiDBkLgDyx8K7WMCMewoVj4bmBeyJSDrItoE6ErNCNYZccC0Zc2W9ePZOfxnDFNazNEbWeNutcSJbavndZ08tCqTyy8aUwsHOMtsAUWAqlVxUCx8f4brMpmjcRXrM8bEcUc1O+27yMk6Oixm+1I++CcGfLUV78PwgeTv0LZHRzqcggLh+vAUdN/AR4/Co1T0rcg2xeDFF4IqOTW9Y/3RV8/kL2CAgkkBy6WE4BgX2vIzsH/COj23wYq2OPFS2pRjopqBvraqNIJQ3GzZuQ2O39695XzRtwYCG9Hy/BOvWVMWBdttB/WqZL/YBXEoVq893UNAV5L7xfvZ5K+0HFfpwn36rtfkz6MeZ2mWNKjv3uA3/4="
@@ -34,3 +47,9 @@ env:
# $BLACK_SWAN_DATABASE_URL
- secure: "obYt+fAcDKox/K6zN7cJsAqW2zZ/wGBfpyhZIP+hoGXWcyMqzby+bdK9SIczePALQz0ihtuabHa5rP4mPm/0A809Q6ccP4WeQRPNoqqeWwX9YYDeU7dp0C/bOxM/7ow4G+9FUaNp5hVlqZ4IKnVF3XmQkbxB4PsnCH0cV/qsF9foe1W7OWtCOLYWNTpfPbl8r6tfcl+MCzAn+SyBiLbLuVOPUhOTe7V1eCDPPpuC/qrA9djJ6fThrcyU9qeYCXMKU/8KdFIUGo96A2ro3JHfixaRSniupcaBPMvTRNcrd4CBhyfZ61PQEIos+GvJQiDX8jysg3UnASS2w3o2meZx4En2G4NCUEki0gq1plNuQIxDHpwpMr97k+WW4OpS05ZzwUBosOry1h94ppzkSNrsvVT2E+GKiieLXIj/dkawWsmiYs4VIRHVBgk1cjDcKaJm2FNYjgOcoYISvpw7aQYvIfvO9+LfeLCu2zAjw0ZZ0EaFO5AlHOogy9GAQZyaUrMmFl5I4QcZdD9hH3Dil0m7zo5RWWblyvhc4oQwhVBDg95kFpxd6ojAZ6IhjMRy8umtd/LSt0GBm9uW1Ci6CMV0H0IAySvZqtuHWjyXnByLhxFp6W5ujRlRCxGMRikw5zZ69trAT9PJcZEQ6uyB8b2XqRXQ48ErkbJ41AlRx6dWcOE="
+
+services:
+ - postgresql
+
+before_script:
+ - createdb sorg-test
diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json
deleted file mode 100644
index bf77ba717..000000000
--- a/Godeps/Godeps.json
+++ /dev/null
@@ -1,88 +0,0 @@
-{
- "ImportPath": "github.com/brandur/sorg",
- "GoVersion": "go1.6",
- "GodepVersion": "v74",
- "Packages": [
- "github.com/brandur/sorg",
- "github.com/brandur/sorg/assets",
- "github.com/brandur/sorg/cmd/sorg-build",
- "github.com/brandur/sorg/cmd/sorg-serve",
- "github.com/brandur/sorg/markdown",
- "github.com/brandur/sorg/templatehelpers",
- "github.com/brandur/sorg/toc"
- ],
- "Deps": [
- {
- "ImportPath": "github.com/Sirupsen/logrus",
- "Comment": "v0.8.3-112-g219c8cb",
- "Rev": "219c8cb75c258c552e999735be6df753ffc7afdc"
- },
- {
- "ImportPath": "github.com/davecgh/go-spew/spew",
- "Rev": "5215b55f46b2b919f50a1df0eaa5886afe4e3b3d"
- },
- {
- "ImportPath": "github.com/joeshaw/envdecode",
- "Rev": "573d2abb11d7ad19d922aa80d4b7870225c20911"
- },
- {
- "ImportPath": "github.com/lib/pq",
- "Comment": "go1.0-cutoff-107-g4dd446e",
- "Rev": "4dd446efc17690bc53e154025146f73203b18309"
- },
- {
- "ImportPath": "github.com/lib/pq/oid",
- "Comment": "go1.0-cutoff-107-g4dd446e",
- "Rev": "4dd446efc17690bc53e154025146f73203b18309"
- },
- {
- "ImportPath": "github.com/pmezard/go-difflib/difflib",
- "Rev": "792786c7400a136282c1664665ae0a8db921c6c2"
- },
- {
- "ImportPath": "github.com/russross/blackfriday",
- "Comment": "v1.4-33-g1d6b8e9",
- "Rev": "1d6b8e9301e720b08a8938b8c25c018285885438"
- },
- {
- "ImportPath": "github.com/shurcooL/sanitized_anchor_name",
- "Rev": "10ef21a441db47d8b13ebcc5fd2310f636973c77"
- },
- {
- "ImportPath": "github.com/stretchr/testify/assert",
- "Comment": "v1.0-17-g089c718",
- "Rev": "089c7181b8c728499929ff09b62d3fdd8df8adff"
- },
- {
- "ImportPath": "github.com/stretchr/testify/require",
- "Comment": "v1.0-17-g089c718",
- "Rev": "089c7181b8c728499929ff09b62d3fdd8df8adff"
- },
- {
- "ImportPath": "github.com/yosssi/ace",
- "Comment": "v0.0.4-51-g71afeb7",
- "Rev": "71afeb714739f9d5f7e1849bcd4a0a5938e1a70d"
- },
- {
- "ImportPath": "github.com/yosssi/gcss",
- "Comment": "v0.1.0-2-g3967759",
- "Rev": "39677598ea4f3ec1da5568173b4d43611f307edb"
- },
- {
- "ImportPath": "golang.org/x/net/html",
- "Rev": "b400c2eff1badec7022a8c8f5bea058b6315eed7"
- },
- {
- "ImportPath": "golang.org/x/net/html/atom",
- "Rev": "b400c2eff1badec7022a8c8f5bea058b6315eed7"
- },
- {
- "ImportPath": "golang.org/x/sys/unix",
- "Rev": "320cb01ddbbf0473674c2585f9b6e245721de355"
- },
- {
- "ImportPath": "gopkg.in/yaml.v2",
- "Rev": "a83829b6f1293c91addabc89d0571c246397bbf4"
- }
- ]
-}
diff --git a/Gopkg.lock b/Gopkg.lock
new file mode 100644
index 000000000..0cbb31022
--- /dev/null
+++ b/Gopkg.lock
@@ -0,0 +1,129 @@
+# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'.
+
+
+[[projects]]
+ name = "github.com/PuerkitoBio/goquery"
+ packages = ["."]
+ revision = "e1271ee34c6a305e38566ecd27ae374944907ee9"
+ version = "v1.1.0"
+
+[[projects]]
+ name = "github.com/Sirupsen/logrus"
+ packages = ["."]
+ revision = "f006c2ac4710855cf0f916dd6b77acf6b048dc6e"
+ version = "v1.0.3"
+
+[[projects]]
+ branch = "master"
+ name = "github.com/andybalholm/cascadia"
+ packages = ["."]
+ revision = "349dd0209470eabd9514242c688c403c0926d266"
+
+[[projects]]
+ name = "github.com/aymerick/douceur"
+ packages = ["css","inliner","parser"]
+ revision = "c5c95ec357c8235fbd7f34e8c843d36783f3fad9"
+ version = "v0.2.0"
+
+[[projects]]
+ name = "github.com/davecgh/go-spew"
+ packages = ["spew"]
+ revision = "346938d642f2ec3594ed81d874461961cd0faa76"
+ version = "v1.1.0"
+
+[[projects]]
+ branch = "master"
+ name = "github.com/gorilla/css"
+ packages = ["scanner"]
+ revision = "398b0b046082ecb3694c01bec6b336a06a4e530a"
+
+[[projects]]
+ branch = "master"
+ name = "github.com/joeshaw/envdecode"
+ packages = ["."]
+ revision = "6326cbed175e32cd5183be1cc1027e0823c91edb"
+
+[[projects]]
+ branch = "master"
+ name = "github.com/lib/pq"
+ packages = [".","oid"]
+ revision = "e42267488fe361b9dc034be7a6bffef5b195bceb"
+
+[[projects]]
+ name = "github.com/pkg/errors"
+ packages = ["."]
+ revision = "645ef00459ed84a119197bfb8d8205042c6df63d"
+ version = "v0.8.0"
+
+[[projects]]
+ name = "github.com/pmezard/go-difflib"
+ packages = ["difflib"]
+ revision = "792786c7400a136282c1664665ae0a8db921c6c2"
+ version = "v1.0.0"
+
+[[projects]]
+ name = "github.com/russross/blackfriday"
+ packages = ["."]
+ revision = "cadec560ec52d93835bf2f15bd794700d3a2473b"
+ version = "v2.0.0"
+
+[[projects]]
+ branch = "master"
+ name = "github.com/shurcooL/sanitized_anchor_name"
+ packages = ["."]
+ revision = "541ff5ee47f1dddf6a5281af78307d921524bcb5"
+
+[[projects]]
+ name = "github.com/stretchr/testify"
+ packages = ["assert","require"]
+ revision = "69483b4bd14f5845b5a1e55bca19e954e827f1d0"
+ version = "v1.1.4"
+
+[[projects]]
+ name = "github.com/yosssi/ace"
+ packages = ["."]
+ revision = "ea038f4770b6746c3f8f84f14fa60d9fe1205b56"
+ version = "v0.0.5"
+
+[[projects]]
+ name = "github.com/yosssi/gcss"
+ packages = ["."]
+ revision = "4dc500a9960c4bc1224c28c9695f6447b1d14d99"
+ version = "v0.1.0"
+
+[[projects]]
+ branch = "master"
+ name = "golang.org/x/crypto"
+ packages = ["ssh/terminal"]
+ revision = "81e90905daefcd6fd217b62423c0908922eadb30"
+
+[[projects]]
+ branch = "master"
+ name = "golang.org/x/net"
+ packages = ["html","html/atom"]
+ revision = "66aacef3dd8a676686c7ae3716979581e8b03c47"
+
+[[projects]]
+ branch = "master"
+ name = "golang.org/x/sys"
+ packages = ["unix","windows"]
+ revision = "7ddbeae9ae08c6a06a59597f0c9edbc5ff2444ce"
+
+[[projects]]
+ name = "gopkg.in/mailgun/mailgun-go.v1"
+ packages = ["."]
+ revision = "a4002e2df2e8ca2da6a6fbb4a72871b504e49f50"
+ version = "v1.1.0"
+
+[[projects]]
+ branch = "v2"
+ name = "gopkg.in/yaml.v2"
+ packages = ["."]
+ revision = "eb3733d160e74a9c7e442f435eb3bea458e1d19f"
+
+[solve-meta]
+ analyzer-name = "dep"
+ analyzer-version = 1
+ inputs-digest = "364a01e866f7c603b2ad072835484ebdfd99dfb544fc693cdda2797ba3a0e071"
+ solver-name = "gps-cdcl"
+ solver-version = 1
diff --git a/Gopkg.toml b/Gopkg.toml
new file mode 100644
index 000000000..0fb9d1143
--- /dev/null
+++ b/Gopkg.toml
@@ -0,0 +1,66 @@
+
+# Gopkg.toml example
+#
+# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md
+# for detailed Gopkg.toml documentation.
+#
+# required = ["github.com/user/thing/cmd/thing"]
+# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"]
+#
+# [[constraint]]
+# name = "github.com/user/project"
+# version = "1.0.0"
+#
+# [[constraint]]
+# name = "github.com/user/project2"
+# branch = "dev"
+# source = "github.com/myfork/project2"
+#
+# [[override]]
+# name = "github.com/x/y"
+# version = "2.4.0"
+
+
+[[constraint]]
+ name = "github.com/Sirupsen/logrus"
+ version = "1.0.3"
+
+[[constraint]]
+ name = "github.com/aymerick/douceur"
+ version = "0.2.0"
+
+[[constraint]]
+ branch = "master"
+ name = "github.com/joeshaw/envdecode"
+
+[[constraint]]
+ branch = "master"
+ name = "github.com/lib/pq"
+
+[[constraint]]
+ name = "github.com/russross/blackfriday"
+ version = "2.0.0"
+
+[[constraint]]
+ name = "github.com/stretchr/testify"
+ version = "1.1.4"
+
+[[constraint]]
+ name = "github.com/yosssi/ace"
+ version = "0.0.5"
+
+[[constraint]]
+ name = "github.com/yosssi/gcss"
+ version = "0.1.0"
+
+[[constraint]]
+ branch = "master"
+ name = "golang.org/x/net"
+
+[[constraint]]
+ name = "gopkg.in/mailgun/mailgun-go.v1"
+ version = "1.1.0"
+
+[[constraint]]
+ branch = "v2"
+ name = "gopkg.in/yaml.v2"
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..a1931e5bc
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Brandur Leach
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Makefile b/Makefile
index 8d0d93191..f111db54a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-all: clean install test vet lint check-gofmt build
+all: clean install test vet lint check-gofmt check-headers check-retina build
build:
$(GOPATH)/bin/sorg-build
@@ -6,11 +6,28 @@ build:
check-gofmt:
scripts/check_gofmt.sh
+check-headers:
+ scripts/check_headers.sh
+
+check-retina:
+ scripts/check_retina.sh
+
clean:
mkdir -p public/
rm -f -r public/*
-deploy: build
+# Long TTL (in seconds) to set on an object in S3. This is suitable for items
+# that we expect to only have to invalidate very rarely like images. Although
+# we set it for all assets, those that are expected to change more frequently
+# like script or stylesheet files are versioned by a path that can be set at
+# build time.
+LONG_TTL := 86400
+
+# Short TTL (in seconds) to set on an object in S3. This is suitable for items
+# that are expected to change more frequently like any HTML file.
+SHORT_TTL := 3600
+
+deploy: build check-target-dir
# Note that AWS_ACCESS_KEY_ID will only be set for builds on the master
# branch because it's stored in `.travis.yml` as an encrypted variable.
# Encrypted variables are not made available to non-master branches because
@@ -23,13 +40,13 @@ ifdef AWS_ACCESS_KEY_ID
# Note that we don't delete because it could result in a race condition in
# that files that are uploaded with special directives below could be
# removed even while the S3 bucket is actively in-use.
- aws s3 sync ./public/ s3://$(S3_BUCKET)/ --acl public-read --content-type text/html --exclude 'assets*' $(AWS_CLI_FLAGS)
+ aws s3 sync $(TARGET_DIR) s3://$(S3_BUCKET)/ --acl public-read --cache-control max-age=$(SHORT_TTL) --content-type text/html --exclude 'assets*' --quiet $(AWS_CLI_FLAGS)
# Then move on to assets and allow S3 to detect content type.
- aws s3 sync ./public/assets/ s3://$(S3_BUCKET)/assets/ --acl public-read --delete --follow-symlinks $(AWS_CLI_FLAGS)
+ aws s3 sync $(TARGET_DIR)/assets/ s3://$(S3_BUCKET)/assets/ --acl public-read --cache-control max-age=$(LONG_TTL) --delete --follow-symlinks --quiet $(AWS_CLI_FLAGS)
# Upload Atom feed files with their proper content type.
- find ./public -name '*.atom' | sed "s|^\./public/||" | xargs -I{} -n1 aws s3 cp ./public/{} s3://$(S3_BUCKET)/{} --acl public-read --content-type application/xml
+ find $(TARGET_DIR) -name '*.atom' | sed "s|^\$(TARGET_DIR)/||" | xargs -I{} -n1 aws s3 cp $(TARGET_DIR)/{} s3://$(S3_BUCKET)/{} --acl public-read --cache-control max-age=$(SHORT_TTL) --content-type application/xml
# This one is a bit tricker to explain, but what we're doing here is
# uploading directory indexes as files at their directory name. So for
@@ -46,43 +63,101 @@ ifdef AWS_ACCESS_KEY_ID
# directory cannot share a name.
# 2. The `index.html` files are useful for emulating a live server locally:
# Golang's http.FileServer will respect them as indexes.
- find ./public -name index.html | egrep -v './public/index.html' | sed "s|^\./public/||" | xargs -I{} -n1 dirname {} | xargs -I{} -n1 aws s3 cp ./public/{}/index.html s3://$(S3_BUCKET)/{} --acl public-read --content-type text/html
+ find $(TARGET_DIR) -name index.html | egrep -v '$(TARGET_DIR)/index.html' | sed "s|^$(TARGET_DIR)/||" | xargs -I{} -n1 dirname {} | xargs -I{} -n1 aws s3 cp $(TARGET_DIR)/{}/index.html s3://$(S3_BUCKET)/{} --acl public-read --cache-control max-age=$(SHORT_TTL) --content-type text/html
+
+ # Give robots.txt (if it exists) a Content-Type of text/plain. Twitter is
+ # rabid about this.
+ [ -f $(TARGET_DIR)/robots.txt ] && aws s3 cp $(TARGET_DIR)/robots.txt s3://$(S3_BUCKET)/ --acl public-read --cache-control max-age=$(SHORT_TTL) --content-type text/plain --quiet $(AWS_CLI_FLAGS) || echo "no robots.txt"
else
# No AWS access key. Skipping deploy.
endif
install:
- go install $(shell go list ./... | egrep -v '/org/|/vendor/')
-
+ go install ./...
+
+# Invalidates CloudFront's cache for paths specified in PATHS.
+#
+# Usage:
+# make PATHS="/fragments /fragments/six-weeks" invalidate
+invalidate: check-aws-keys check-cloudfront-id
+ifndef PATHS
+ $(error PATHS is required)
+endif
+ aws cloudfront create-invalidation --distribution-id $(CLOUDFRONT_ID) --paths ${PATHS}
+
+# Invalidates CloudFront's entire cache.
+invalidate-all: check-aws-keys check-cloudfront-id
+ aws cloudfront create-invalidation --distribution-id $(CLOUDFRONT_ID) --paths /
+
+# Invalidates CloudFront's cached assets.
+invalidate-assets: check-aws-keys check-cloudfront-id
+ aws cloudfront create-invalidation --distribution-id $(CLOUDFRONT_ID) --paths /assets
+
+# Invalidates CloudFront's cached index pages. This is useful, but not
+# necessarily required, when publishing articles or new data (if it's not run,
+# anything cached in CloudFront will expire naturally after SHORT_TTL).
+invalidate-indexes: check-aws-keys check-cloudfront-id
+ aws cloudfront create-invalidation --distribution-id $(CLOUDFRONT_ID) --paths /articles /articles.atom /fragments /fragments.atom /photos /reading /runs /twitter
+
+# Note that unfortunately Golint doesn't work like other Go commands: it only
+# takes a single argument at a time and expects that each is the name of a
+# local directory (as opposed to a package).
+#
+# The exit 255 trick ensures that xargs will actually bubble a failure back up
+# to the entire command.
lint:
- $(GOPATH)/bin/golint -set_exit_status
-
-save:
- godep save $(shell go list ./... | egrep -v '/org/|/vendor/')
+ go list ./... | xargs -I{} -n1 sh -c '$(GOPATH)/bin/golint -set_exit_status {} || exit 255'
serve:
$(GOPATH)/bin/sorg-serve
test:
- go test $(shell go list ./... | egrep -v '/org/|/vendor/')
+ psql postgres://localhost/sorg-test < testing/black_swan.sql > /dev/null
+ go test ./...
vet:
- go vet $(shell go list ./... | egrep -v '/org/|/vendor/')
+ go vet ./...
# Note that we use the CONTENT_ONLY flag on the build here. We're watching for
# changes in content directories, so don't bother rebuilding pages generated
# from Black Swan data.
watch:
- fswatch -o layouts/ pages/ org/ views/ | CONTENT_ONLY=true xargs -n1 -I{} make build
+ fswatch -o content/ layouts/ pages/ views/ | CONTENT_ONLY=true xargs -n1 -I{} make build
# This is designed to be compromise between being explicit and readability. We
# can allow the find to discover everything in vendor/, but then the fswatch
-# invocation becoems a huge unreadable wall of text that gets dumped into the
+# invocation becomes a huge unreadable wall of text that gets dumped into the
# shell. Instead, find all our own *.go files and then just tack the vendor/
# directory on separately (fswatch will watch it recursively).
-GO_FILES := $(shell find . -type f -name "*.go" ! -path "./org/*" ! -path "./vendor/*")
+GO_FILES := $(shell find . -type f -name "*.go" ! -path "./vendor/*")
# We recompile our Go source when a file changes, but we also rebuild the site
# because a change in source may have affected the build formula.
watch-go:
fswatch -o $(GO_FILES) vendor/ | xargs -n1 -I{} make install build
+
+#
+# Helpers
+#
+
+# Requires that variables necessary to make an AWS API call are in the
+# environment.
+check-aws-keys:
+ifndef AWS_ACCESS_KEY_ID
+ $(error AWS_ACCESS_KEY_ID is required)
+endif
+ifndef AWS_SECRET_ACCESS_KEY
+ $(error AWS_SECRET_ACCESS_KEY is required)
+endif
+
+# Requires that variables necessary to update a CloudFront distribution are in
+# the environment.
+check-cloudfront-id:
+ifndef CLOUDFRONT_ID
+ $(error CLOUDFRONT_ID is required)
+endif
+
+check-target-dir:
+ifndef TARGET_DIR
+ $(error TARGET_DIR is required)
+endif
diff --git a/README.md b/README.md
index 4f2269f96..89854272e 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,15 @@ committed to the master branch.
## Build
-Install Go 1.6+, set up and run [black-swan][black-swan], then:
+Install Go 1.9+, set up and run [black-swan][black-swan], then:
``` sh
go get -u github.com/ddollar/forego
-cp .env.sample
+cp .env.sample .env
+
+# Used to run the test suite.
+createdb sorg-test
# Compile Go executables.
make install
@@ -27,21 +30,51 @@ forego run make build
forego start
```
-Or an easy all-in-one:
+The project can be deployed to s3 using:
``` sh
-make install && forego run make build && forego start
+pip install awscli
+
+export AWS_ACCESS_KEY_ID=...
+export AWS_SECRET_ACCESS_KEY=...
+export S3_BUCKET=...
+make deploy
```
-The project can be deployed to s3 using:
+## Development
+
+Run the entire lifecycle like in CI:
``` sh
-AWS_ACCESS_KEY_ID=...
-AWS_SECRET_ACCESS_KEY=...
-S3_BUCKET=...
-make deploy
+make
+```
+
+Run the test suite:
+
+``` sh
+make test
```
-[black-swan]: https://github.com/brandur/black-swan
+Run a single package's test suite or single test:
+
+``` sh
+go test ./markdown
+go test ./markdown -run TestCollapseHTML
+```
+
+Get more verbose output while running tests:
+
+```
+go test -v ./markdown
+```
+
+## Vendoring Dependencies
+
+Dependencies are managed with dep. New ones can be vendored
+using this command:
+
+ dep ensure -add github.com/foo/bar
+
+[black-swan]: https://github.com/brandur/black-swan
[brandur]: https://brandur.org
[org]: https://github.com/brandur/org
diff --git a/TODO.md b/TODO.md
deleted file mode 100644
index 2d9256d1e..000000000
--- a/TODO.md
+++ /dev/null
@@ -1,28 +0,0 @@
-# TODO
-
-* [ ] Test for the majority of functions in build's main.go.
-* [ ] Download images found in fragment frontmatter (maybe just stored them
- locally).
-* [ ] Remove orphaned files between builds.
-* [ ] Faster build by passing changes paths to program.
-
-## Done
-
-* [x] Move org content into project.
-* [x] Update README.
-* [x] Rewrite about page.
-* [x] Fix validation problems in Atom feed.
-* [x] Speed up the build (maybe incremental?).
-* [x] Fix logging: debug is currently too verbose for Travis runs, but normal
- is not verbose enough.
-* [x] Smarter asset symlinking that doesn't remove and create every time.
-* [x] Move to a leading slash system in paths.
-* [x] Don't use so many path constants ... it's not really helping.
-* [x] Atom feeds.
-* [x] Make sure that all viewport widths have been accounted for.
-* [x] Finish building out "about" and various other one-off pages.
-* [x] Figure out a schema for talks/about pages title.
-* [x] Render article and fragment drafts in development.
-* [x] Section headers in build's main.go; it's getting hard to track.
-* [x] Change conf DB checks to nil checks on `db`.
-* [x] Procfile rebuild on *.go file changes.
diff --git a/assets/assets.go b/assets/assets.go
index 2cadf29a5..f1a7ed7f2 100644
--- a/assets/assets.go
+++ b/assets/assets.go
@@ -1,67 +1,133 @@
package assets
import (
- "bufio"
"fmt"
"io"
- "net/http"
+ "io/ioutil"
"os"
+ "path"
+ "strings"
+ "time"
log "github.com/Sirupsen/logrus"
+ "github.com/yosssi/gcss"
)
-type Asset struct {
- URL string
- Target string
-}
+// CompileJavascripts compiles a set of JS files into a single large file by
+// appending them all to each other. Files are appended in alphabetical order
+// so we depend on the fact that there aren't too many interdependencies
+// between files. A common requirement can be given an underscore prefix to be
+// loaded first.
+func CompileJavascripts(inPath, outPath string) error {
+ start := time.Now()
+ defer func() {
+ log.Debugf("Compiled script assets in %v.", time.Now().Sub(start))
+ }()
+
+ log.Debugf("Building: %v", outPath)
+
+ javascriptInfos, err := ioutil.ReadDir(inPath)
+ if err != nil {
+ return err
+ }
+
+ outFile, err := os.Create(outPath)
+ if err != nil {
+ return err
+ }
+ defer outFile.Close()
+
+ for _, javascriptInfo := range javascriptInfos {
+ if isHidden(javascriptInfo.Name()) {
+ continue
+ }
-func Fetch(assets []Asset) error {
- client := &http.Client{}
+ log.Debugf("Including: %v", javascriptInfo.Name())
- for _, asset := range assets {
- err := fetchAsset(client, asset)
+ inFile, err := os.Open(path.Join(inPath, javascriptInfo.Name()))
if err != nil {
return err
}
+
+ outFile.WriteString("/* " + javascriptInfo.Name() + " */\n\n")
+ outFile.WriteString("(function() {\n\n")
+
+ // Ignore non-JS files in the directory (I have a README in there)
+ if strings.HasSuffix(javascriptInfo.Name(), ".js") {
+ _, err = io.Copy(outFile, inFile)
+ if err != nil {
+ return err
+ }
+ }
+
+ outFile.WriteString("\n\n")
+ outFile.WriteString("}).call(this);\n\n")
}
return nil
}
-func fetchAsset(client *http.Client, asset Asset) error {
- if _, err := os.Stat(asset.Target); !os.IsNotExist(err) {
- log.Debugf("Skipping asset because local file exists: %v", asset.URL)
- return nil
- }
+// CompileStylesheets compiles a set of stylesheet files into a single large
+// file by appending them all to each other. Files are appended in alphabetical
+// order so we depend on the fact that there aren't too many interdependencies
+// between files. CSS reset in particular is given an underscore prefix so that
+// it gets to load first.
+//
+// If a file has a ".sass" suffix, we attempt to render it as GCSS. This isn't
+// a perfect symmetry, but works well enough for these cases.
+func CompileStylesheets(inPath, outPath string) error {
+ start := time.Now()
+ defer func() {
+ log.Debugf("Compiled stylesheet assets in %v.", time.Now().Sub(start))
+ }()
- log.Debugf("Fetching asset: %v", asset.URL)
+ log.Debugf("Building: %v", outPath)
- resp, err := client.Get(asset.URL)
+ stylesheetInfos, err := ioutil.ReadDir(inPath)
if err != nil {
return err
}
- defer resp.Body.Close()
-
- if resp.StatusCode != 200 {
- return fmt.Errorf("Unexpected status code %d while fetching: %v",
- resp.StatusCode, asset.URL)
- }
- f, err := os.Create(asset.Target)
+ outFile, err := os.Create(outPath)
if err != nil {
return err
}
- defer f.Close()
+ defer outFile.Close()
- w := bufio.NewWriter(f)
+ for _, stylesheetInfo := range stylesheetInfos {
+ if isHidden(stylesheetInfo.Name()) {
+ continue
+ }
- // probably not needed
- defer w.Flush()
+ log.Debugf("Including: %v", stylesheetInfo.Name())
- _, err = io.Copy(w, resp.Body)
- if err != nil {
- return err
+ inFile, err := os.Open(path.Join(inPath, stylesheetInfo.Name()))
+ if err != nil {
+ return err
+ }
+
+ outFile.WriteString("/* " + stylesheetInfo.Name() + " */\n\n")
+
+ if strings.HasSuffix(stylesheetInfo.Name(), ".sass") {
+ _, err := gcss.Compile(outFile, inFile)
+ if err != nil {
+ return fmt.Errorf("Error compiling %v: %v",
+ stylesheetInfo.Name(), err)
+ }
+ } else {
+ _, err := io.Copy(outFile, inFile)
+ if err != nil {
+ return err
+ }
+ }
+
+ outFile.WriteString("\n\n")
}
return nil
}
+
+// Detects a hidden file, i.e. one that starts with a dot.
+func isHidden(file string) bool {
+ return strings.HasPrefix(file, ".")
+}
diff --git a/assets/assets_test.go b/assets/assets_test.go
index 2a172e065..8e070a573 100644
--- a/assets/assets_test.go
+++ b/assets/assets_test.go
@@ -2,25 +2,118 @@ package assets
import (
"io/ioutil"
- "os"
"testing"
assert "github.com/stretchr/testify/require"
)
-func TestFetch(t *testing.T) {
- tempfile, err := ioutil.TempFile("", "asset")
+func TestCompileJavascripts(t *testing.T) {
+ dir, err := ioutil.TempDir("", "javascripts")
+
+ file0 := dir + "/.hidden"
+ file1 := dir + "/file1.js"
+ file2 := dir + "/file2.js"
+ file3 := dir + "/file3.js"
+ out := dir + "/app.js"
+
+ // This file is hidden and doesn't show up in output.
+ err = ioutil.WriteFile(file0, []byte(`hidden`), 0755)
+ assert.NoError(t, err)
+
+ err = ioutil.WriteFile(file1, []byte(`function() { return "file1" }`), 0755)
+ assert.NoError(t, err)
+
+ err = ioutil.WriteFile(file2, []byte(`function() { return "file2" }`), 0755)
+ assert.NoError(t, err)
+
+ err = ioutil.WriteFile(file3, []byte(`function() { return "file3" }`), 0755)
+ assert.NoError(t, err)
+
+ err = CompileJavascripts(dir, out)
+ assert.NoError(t, err)
+
+ actual, err := ioutil.ReadFile(out)
+ assert.NoError(t, err)
+
+ expected := `/* file1.js */
+
+(function() {
+
+function() { return "file1" }
+
+}).call(this);
+
+/* file2.js */
+
+(function() {
+
+function() { return "file2" }
+
+}).call(this);
+
+/* file3.js */
+
+(function() {
+
+function() { return "file3" }
+
+}).call(this);
+
+`
+ assert.Equal(t, expected, string(actual))
+}
+
+func TestCompileStylesheets(t *testing.T) {
+ dir, err := ioutil.TempDir("", "stylesheets")
+
+ file0 := dir + "/.hidden"
+ file1 := dir + "/file1.sass"
+ file2 := dir + "/file2.sass"
+ file3 := dir + "/file3.css"
+ out := dir + "/app.css"
+
+ // This file is hidden and doesn't show up in output.
+ err = ioutil.WriteFile(file0, []byte("hidden"), 0755)
+ assert.NoError(t, err)
+
+ // The syntax of the first and second files is GCSS and the third is in
+ // CSS.
+ err = ioutil.WriteFile(file1, []byte("p\n margin: 10px"), 0755)
assert.NoError(t, err)
- defer os.Remove(tempfile.Name())
- assets := []Asset{
- {"http://localhost", tempfile.Name()},
- }
+ err = ioutil.WriteFile(file2, []byte("p\n padding: 10px"), 0755)
+ assert.NoError(t, err)
+
+ err = ioutil.WriteFile(file3, []byte("p {\n border: 10px;\n}"), 0755)
+ assert.NoError(t, err)
- // Because the temp file already exists, no fetch will be made.
- err = Fetch(assets)
+ err = CompileStylesheets(dir, out)
assert.NoError(t, err)
- // We should also have a real fetch test, but this is not currently
- // implemented.
+ actual, err := ioutil.ReadFile(out)
+ assert.NoError(t, err)
+
+ // Note that the first two files have no spacing in the output because they
+ // go through the GCSS compiler.
+ expected := `/* file1.sass */
+
+p{margin:10px;}
+
+/* file2.sass */
+
+p{padding:10px;}
+
+/* file3.css */
+
+p {
+ border: 10px;
+}
+
+`
+ assert.Equal(t, expected, string(actual))
+}
+
+func TestIsHidden(t *testing.T) {
+ assert.Equal(t, true, isHidden(".gitkeep"))
+ assert.Equal(t, false, isHidden("article"))
}
diff --git a/cmd/sorg-build/main.go b/cmd/sorg-build/main.go
index e8fc8c0bd..2a5a015e3 100644
--- a/cmd/sorg-build/main.go
+++ b/cmd/sorg-build/main.go
@@ -9,7 +9,6 @@ import (
"os"
"path"
"path/filepath"
- "regexp"
"sort"
"strings"
"time"
@@ -18,41 +17,18 @@ import (
"github.com/brandur/sorg"
"github.com/brandur/sorg/assets"
"github.com/brandur/sorg/atom"
+ "github.com/brandur/sorg/downloader"
"github.com/brandur/sorg/markdown"
+ "github.com/brandur/sorg/passages"
+ "github.com/brandur/sorg/pool"
"github.com/brandur/sorg/templatehelpers"
"github.com/brandur/sorg/toc"
"github.com/joeshaw/envdecode"
_ "github.com/lib/pq"
"github.com/yosssi/ace"
- "github.com/yosssi/gcss"
"gopkg.in/yaml.v2"
)
-var javascripts = []string{
- "jquery-1.7.2.js",
- "retina.js",
- "highcharts.js",
- "highcharts_theme.js",
- "highlight.pack.js",
- "main_sorg.js",
-}
-
-var stylesheets = []string{
- "_reset.sass",
- "main.sass",
- "about.sass",
- "fragments.sass",
- "index.sass",
- "photos.sass",
- "quotes.sass",
- "reading.sass",
- "runs.sass",
- "signature.sass",
- "solarized-light.css",
- "tenets.sass",
- "twitter.sass",
-}
-
//
// Types
//
@@ -72,15 +48,25 @@ type Article struct {
// rendered, and then added separately.
Content string `yaml:"-"`
+ // Draft indicates that the article is not yet published.
+ Draft bool `yaml:"-"`
+
// HNLink is an optional link to comments on Hacker News.
HNLink string `yaml:"hn_link"`
// Hook is a leading sentence or two to succinctly introduce the article.
Hook string `yaml:"hook"`
+ // HookImageURL is the URL for a hook image for the article (to be shown on
+ // the article index) if one was found.
+ HookImageURL string `yaml:"-"`
+
// Image is an optional image that may be included with an article.
Image string `yaml:"image"`
+ // Location is the geographical location where this article was written.
+ Location string `yaml:"location"`
+
// PublishedAt is when the article was published.
PublishedAt *time.Time `yaml:"published_at"`
@@ -97,13 +83,28 @@ type Article struct {
TOC string `yaml:"-"`
}
+// PublishingInfo produces a brief spiel about publication which is intended to
+// go into the left sidebar when an article is shown.
+func (a *Article) PublishingInfo() string {
+ return `
` + a.Title + ` was published on ` +
+ a.PublishedAt.Format("January 2, 2006") + ` from ` +
+ a.Location + `.
` + twitterInfo
+}
+
type articleByPublishedAt []*Article
func (a articleByPublishedAt) Len() int { return len(a) }
func (a articleByPublishedAt) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a articleByPublishedAt) Less(i, j int) bool { return a[i].PublishedAt.Before(*a[j].PublishedAt) }
-// Conf contains configuration information for the command.
+// articleYear holds a collection of articles grouped by year.
+type articleYear struct {
+ Year int
+ Articles []*Article
+}
+
+// Conf contains configuration information for the command. It's extracted from
+// environment variables.
type Conf struct {
// AtomAuthorName is the name of the author to include in Atom feeds.
AtomAuthorName string `env:"AUTHOR_NAME,default=Brandur Leach"`
@@ -115,8 +116,15 @@ type Conf struct {
// in order to extract books, tweets, runs, etc.
BlackSwanDatabaseURL string `env:"BLACK_SWAN_DATABASE_URL"`
+ // Concurrency is the number of build Goroutines that will be used to
+ // perform build work items.
+ Concurrency int `env:"CONCURRENCY,default=30"`
+
// Drafts is whether drafts of articles and fragments should be compiled
// along with their published versions.
+ //
+ // Activating drafts also prompts the creation of a robots.txt to make sure
+ // that drafts aren't inadvertently accessed by web crawlers.
Drafts bool `env:"DRAFTS,default=false"`
// ContentOnly tells the build step that it should build using only files
@@ -128,9 +136,22 @@ type Conf struct {
// GoogleAnalyticsID is the account identifier for Google Analytics to use.
GoogleAnalyticsID string `env:"GOOGLE_ANALYTICS_ID"`
+ // LocalFonts starts using locally downloaded versions of Google Fonts.
+ // This is not ideal for real deployment because you won't be able to
+ // leverage Google's CDN and the caching that goes with it, and may not get
+ // the font format for requesting browsers, but good for airplane rides
+ // where you otherwise wouldn't have the fonts.
+ LocalFonts bool `env:"LOCAL_FONTS,default=false"`
+
+ // NumAtomEntries is the number of entries to put in Atom feeds.
+ NumAtomEntries int `env:"NUM_ATOM_ENTRIES,default=20"`
+
// SiteURL is the absolute URL where the compiled site will be hosted.
SiteURL string `env:"SITE_URL,default=https://brandur.org"`
+ // TargetDir is the target location where the site will be built to.
+ TargetDir string `env:"TARGET_DIR,default=./public"`
+
// Verbose is whether the program will print debug output as it's running.
Verbose bool `env:"VERBOSE,default=false"`
}
@@ -138,11 +159,24 @@ type Conf struct {
// Fragment represents a fragment (that is, a short "stream of consciousness"
// style article) to be rendered.
type Fragment struct {
+ // Attributions are any attributions for content that may be included in
+ // the article (like an image in the header for example).
+ Attributions string `yaml:"attributions"`
+
// Content is the HTML content of the fragment. It isn't included as YAML
// frontmatter, and is rather split out of an fragment's Markdown file,
// rendered, and then added separately.
Content string `yaml:"-"`
+ // Draft indicates that the fragment is not yet published.
+ Draft bool `yaml:"-"`
+
+ // HNLink is an optional link to comments on Hacker News.
+ HNLink string `yaml:"hn_link"`
+
+ // Hook is a leading sentence or two to succinctly introduce the fragment.
+ Hook string `yaml:"hook"`
+
// Image is an optional image that may be included with a fragment.
Image string `yaml:"image"`
@@ -157,12 +191,45 @@ type Fragment struct {
Title string `yaml:"title"`
}
+// PublishingInfo produces a brief spiel about publication which is intended to
+// go into the left sidebar when a fragment is shown.
+func (f *Fragment) PublishingInfo() string {
+ return `
` + f.Title + ` was published on ` +
+ f.PublishedAt.Format("January 2, 2006") + `.
` +
+ twitterInfo
+}
+
type fragmentByPublishedAt []*Fragment
func (a fragmentByPublishedAt) Len() int { return len(a) }
func (a fragmentByPublishedAt) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a fragmentByPublishedAt) Less(i, j int) bool { return a[i].PublishedAt.Before(*a[j].PublishedAt) }
+// fragmentYear holds a collection of fragments grouped by year.
+type fragmentYear struct {
+ Year int
+ Fragments []*Fragment
+}
+
+// Page is the metadata for a static HTML page generated from an ACE file.
+// Currently the layout system of ACE doesn't allow us to pass metadata up
+// very well, so we have this instead.
+type Page struct {
+ // BodyClass is the CSS class that will be assigned to the body tag when
+ // the page is rendered.
+ BodyClass string `yaml:"body_class"`
+
+ // Title is the HTML title that will be assigned to the page when it's
+ // rendered.
+ Title string `yaml:"title"`
+}
+
+type passageByPublishedAt []*passages.Passage
+
+func (p passageByPublishedAt) Len() int { return len(p) }
+func (p passageByPublishedAt) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
+func (p passageByPublishedAt) Less(i, j int) bool { return p[i].PublishedAt.Before(*p[j].PublishedAt) }
+
// Photo is a photography downloaded from Flickr.
type Photo struct {
// LargeImageURL is the location where the large-sized version of the photo
@@ -264,6 +331,28 @@ type tweetMonth struct {
Tweets []*Tweet
}
+// twitterCard represents a Twitter "card" (i.e. one of those rich media boxes
+// that sometimes appear under tweets in official clients) for use in templates.
+type twitterCard struct {
+ // Title is the title to show in the card.
+ Title string
+
+ // Description is the description to show in the card.
+ Description string
+
+ // ImageURL is the URL to the image to show in the card. It should be
+ // absolute because Twitter will need to be able to fetch it from our
+ // servers. Leave blank if there is no image.
+ ImageURL string
+}
+
+//
+// Constants
+//
+
+const twitterInfo = `
`
+
//
// Variables
//
@@ -272,43 +361,6 @@ type tweetMonth struct {
// very many places and can probably be refactored as a local if desired.
var conf Conf
-var errBadFrontmatter = fmt.Errorf("Unable to split YAML frontmatter")
-
-// pagesVars contains meta information for static pages that are part of the
-// site. This mostly titles, but can also be body classes for custom styling.
-//
-// This isn't the best system, but was the cheapest way to accomplish what I
-// needed for the time being. It could probably use an overhaul to something
-// better at some point.
-var pagesVars = map[string]map[string]interface{}{
- "about": {
- "Title": "About",
- },
- "accidental": {
- "Title": "Accidental",
- "BodyClass": "quote",
- },
- "crying": {
- "Title": "Crying",
- "BodyClass": "quote",
- },
- "favors": {
- "Title": "Favors",
- "BodyClass": "quote",
- },
- "lies": {
- "Title": "Lies",
- "BodyClass": "quote",
- },
- "talks": {
- "Title": "Talks",
- },
- "that-sunny-dome": {
- "Title": "That Sunny Dome",
- "BodyClass": "quote",
- },
-}
-
//
// Main
//
@@ -335,159 +387,438 @@ func main() {
sorg.InitLog(conf.Verbose)
- err = sorg.CreateTargetDirs()
+ // This is where we store "versioned" assets like compiled JS and CSS.
+ // These assets have a release number that we can increment and by
+ // extension quickly invalidate.
+ versionedAssetsDir := path.Join(conf.TargetDir, "assets", sorg.Release)
+
+ err = sorg.CreateOutputDirs(conf.TargetDir)
if err != nil {
log.Fatal(err)
}
- articles, err := compileArticles()
+ var tasks []*pool.Task
+
+ //
+ // Build step 0: dependency-free
+ //
+
+ tasks = nil
+
+ // Articles, fragments, and pages are slightly special cases in that we
+ // parallelize the creation of all of them at once. That is, every
+ // article will have a separate entry in our work queue.
+
+ var articles []*Article
+ articleChan := accumulateArticles(&articles)
+
+ var fragments []*Fragment
+ fragmentChan := accumulateFragments(&fragments)
+
+ var passages []*passages.Passage
+ passageChan := accumulatePassages(&passages)
+
+ articleTasks, err := tasksForArticles(articleChan)
if err != nil {
log.Fatal(err)
}
+ tasks = append(tasks, articleTasks...)
- fragments, err := compileFragments()
+ fragmentTasks, err := tasksForFragments(fragmentChan)
if err != nil {
log.Fatal(err)
}
+ tasks = append(tasks, fragmentTasks...)
- err = compileJavascripts(javascripts)
+ pageTasks, err := tasksForPages()
if err != nil {
log.Fatal(err)
}
+ tasks = append(tasks, pageTasks...)
- err = compilePages()
+ passageTasks, err := tasksForPassages(passageChan)
if err != nil {
log.Fatal(err)
}
+ tasks = append(tasks, passageTasks...)
- photos, err := compilePhotos(db)
- if err != nil {
- log.Fatal(err)
+ // Most other types are all one-off pages or other resources and only get a
+ // single entry each in the work queue.
+
+ var photos []*Photo
+ tasks = append(tasks, pool.NewTask(func() error {
+ var err error
+ photos, err = compilePhotos(db)
+ return err
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return assets.CompileJavascripts(
+ path.Join(sorg.ContentDir, "javascripts"),
+ path.Join(versionedAssetsDir, "app.js"))
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compileReading(db)
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compileRuns(db)
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compileRobots(path.Join(conf.TargetDir, "robots.txt"))
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return assets.CompileStylesheets(
+ path.Join(sorg.ContentDir, "stylesheets"),
+ path.Join(versionedAssetsDir, "app.css"))
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compileTwitter(db)
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return linkImages()
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return linkFonts()
+ }))
+
+ if !runTasks(tasks) {
+ os.Exit(1)
}
- err = compileReading(db)
- if err != nil {
- log.Fatal(err)
+ // Free up any Goroutines still waiting.
+ close(articleChan)
+ close(fragmentChan)
+ close(passageChan)
+
+ //
+ // Build step 1: any tasks dependent on the results of step 0.
+ //
+ // This includes build output like index pages and RSS feeds.
+ //
+
+ tasks = nil
+
+ sort.Sort(sort.Reverse(articleByPublishedAt(articles)))
+ sort.Sort(sort.Reverse(fragmentByPublishedAt(fragments)))
+ sort.Sort(sort.Reverse(passageByPublishedAt(passages)))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compileArticlesFeed(articles)
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compileArticlesIndex(articles)
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compileFragmentsFeed(fragments)
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compileFragmentsIndex(fragments)
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compilePassagesIndex(passages)
+ }))
+
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compileHome(articles, fragments, photos)
+ }))
+
+ if !runTasks(tasks) {
+ os.Exit(1)
}
+}
+
+//
+// Compilation functions
+//
+// These functions perform the heavy-lifting in compiling the site's resources.
+// They are normally run concurrently.
+//
- err = compileRuns(db)
+func compileArticle(dir, name string, draft bool) (*Article, error) {
+ inPath := path.Join(dir, name)
+
+ raw, err := ioutil.ReadFile(inPath)
if err != nil {
- log.Fatal(err)
+ return nil, err
}
- err = compileStylesheets(stylesheets)
+ frontmatter, content, err := sorg.SplitFrontmatter(string(raw))
if err != nil {
- log.Fatal(err)
+ return nil, err
}
- err = compileTwitter(db)
+ var article Article
+ err = yaml.Unmarshal([]byte(frontmatter), &article)
if err != nil {
- log.Fatal(err)
+ return nil, err
}
- err = compileHome(articles, fragments, photos)
+ article.Draft = draft
+ article.Slug = strings.Replace(name, ".md", "", -1)
+
+ if article.Title == "" {
+ return nil, fmt.Errorf("No title for article: %v", inPath)
+ }
+
+ if article.PublishedAt == nil {
+ return nil, fmt.Errorf("No publish date for article: %v", inPath)
+ }
+
+ article.Content = markdown.Render(content, nil)
+
+ article.TOC, err = toc.Render(article.Content)
if err != nil {
- log.Fatal(err)
+ return nil, err
+ }
+
+ format, ok := pathAsImage(
+ path.Join(sorg.ContentDir, "images", article.Slug, "hook"),
+ )
+ if ok {
+ article.HookImageURL = "/assets/" + article.Slug + "/hook." + format
}
- err = linkImageAssets()
+ if err != nil && !os.IsNotExist(err) {
+ return nil, err
+ }
+
+ card := &twitterCard{
+ Title: article.Title,
+ Description: article.Hook,
+ }
+ format, ok = pathAsImage(
+ path.Join(sorg.ContentDir, "images", article.Slug, "twitter@2x"),
+ )
+ if ok {
+ card.ImageURL = sorg.AbsoluteURL + "/assets/" + article.Slug + "/twitter@2x." + format
+ }
+
+ locals := getLocals(article.Title, map[string]interface{}{
+ "Article": article,
+ "PublishingInfo": article.PublishingInfo(),
+ "TwitterCard": card,
+ })
+
+ err = renderView(sorg.MainLayout, sorg.ViewsDir+"/articles/show",
+ path.Join(conf.TargetDir, article.Slug), locals)
if err != nil {
- log.Fatal(err)
+ return nil, err
}
-}
-//
-// Compilation functions
-//
-// These functions are the main entry points for compiling the site's
-// resources.
-//
+ return &article, nil
+}
-func compileArticles() ([]*Article, error) {
+func compileArticlesFeed(articles []*Article) error {
start := time.Now()
defer func() {
- log.Debugf("Compiled articles in %v.", time.Now().Sub(start))
+ log.Debugf("Compiled articles feed in %v.", time.Now().Sub(start))
}()
- articles, err := compileArticlesDir(sorg.ContentDir + "/articles")
- if err != nil {
- return nil, err
+ feed := &atom.Feed{
+ Title: "Articles - brandur.org",
+ ID: "tag:brandur.org.org,2013:/articles",
+
+ Links: []*atom.Link{
+ {Rel: "self", Type: "application/atom+xml", Href: "https://brandur.org/articles.atom"},
+ {Rel: "alternate", Type: "text/html", Href: "https://brandur.org"},
+ },
}
- if conf.Drafts {
- drafts, err := compileArticlesDir(sorg.ContentDir + "/drafts")
- if err != nil {
- return nil, err
+ if len(articles) > 0 {
+ feed.Updated = *articles[0].PublishedAt
+ }
+
+ for i, article := range articles {
+ if i >= conf.NumAtomEntries {
+ break
}
- articles = append(articles, drafts...)
+ entry := &atom.Entry{
+ Title: article.Title,
+ Content: &atom.EntryContent{Content: article.Content, Type: "html"},
+ Published: *article.PublishedAt,
+ Updated: *article.PublishedAt,
+ Link: &atom.Link{Href: conf.SiteURL + "/" + article.Slug},
+ ID: "tag:brandur.org," + article.PublishedAt.Format("2006-01-02") + ":" + article.Slug,
+
+ AuthorName: conf.AtomAuthorName,
+ AuthorURI: conf.AtomAuthorURL,
+ }
+ feed.Entries = append(feed.Entries, entry)
}
- sort.Sort(sort.Reverse(articleByPublishedAt(articles)))
+ f, err := os.Create(conf.TargetDir + "/articles.atom")
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ return feed.Encode(f, " ")
+}
+
+func compileArticlesIndex(articles []*Article) error {
+ start := time.Now()
+ defer func() {
+ log.Debugf("Compiled articles index in %v.", time.Now().Sub(start))
+ }()
+
+ articlesByYear := groupArticlesByYear(articles)
locals := getLocals("Articles", map[string]interface{}{
- "Articles": articles,
+ "ArticlesByYear": articlesByYear,
})
- err = renderView(sorg.MainLayout, sorg.ViewsDir+"/articles/index",
- sorg.TargetDir+"/articles/index.html", locals)
+ err := renderView(sorg.MainLayout, sorg.ViewsDir+"/articles/index",
+ conf.TargetDir+"/articles/index.html", locals)
+ if err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func compileFragment(dir, name string, draft bool) (*Fragment, error) {
+ inPath := path.Join(dir, name)
+
+ raw, err := ioutil.ReadFile(inPath)
if err != nil {
return nil, err
}
- err = compileArticlesFeed(articles)
+ frontmatter, content, err := sorg.SplitFrontmatter(string(raw))
if err != nil {
return nil, err
}
- return articles, nil
+ var fragment Fragment
+ err = yaml.Unmarshal([]byte(frontmatter), &fragment)
+ if err != nil {
+ return nil, err
+ }
+
+ fragment.Draft = draft
+ fragment.Slug = strings.Replace(name, ".md", "", -1)
+
+ if fragment.Title == "" {
+ return nil, fmt.Errorf("No title for fragment: %v", inPath)
+ }
+
+ if fragment.PublishedAt == nil {
+ return nil, fmt.Errorf("No publish date for fragment: %v", inPath)
+ }
+
+ fragment.Content = markdown.Render(content, nil)
+
+ // A lot of fragments still have unwritten hooks, so only add a card where
+ // a fragment has a configured Twitter image for the time being.
+ var card *twitterCard
+ format, ok := pathAsImage(
+ path.Join(sorg.ContentDir, "images", "fragments", fragment.Slug, "twitter@2x"),
+ )
+ if ok {
+ card = &twitterCard{
+ ImageURL: sorg.AbsoluteURL + "/assets/fragments/" + fragment.Slug + "/twitter@2x." + format,
+ Title: fragment.Title,
+ Description: fragment.Hook,
+ }
+ }
+
+ locals := getLocals(fragment.Title, map[string]interface{}{
+ "Fragment": fragment,
+ "PublishingInfo": fragment.PublishingInfo(),
+ "TwitterCard": card,
+ })
+
+ err = renderView(sorg.MainLayout, sorg.ViewsDir+"/fragments/show",
+ conf.TargetDir+"/fragments/"+fragment.Slug, locals)
+ if err != nil {
+ return nil, err
+ }
+
+ return &fragment, nil
}
-func compileFragments() ([]*Fragment, error) {
+func compileFragmentsFeed(fragments []*Fragment) error {
start := time.Now()
defer func() {
- log.Debugf("Compiled fragments in %v.", time.Now().Sub(start))
+ log.Debugf("Compiled fragments feed in %v.", time.Now().Sub(start))
}()
- fragments, err := compileFragmentsDir(sorg.ContentDir + "/fragments")
- if err != nil {
- return nil, err
+ feed := &atom.Feed{
+ Title: "Fragments - brandur.org",
+ ID: "tag:brandur.org.org,2013:/fragments",
+
+ Links: []*atom.Link{
+ {Rel: "self", Type: "application/atom+xml", Href: "https://brandur.org/fragments.atom"},
+ {Rel: "alternate", Type: "text/html", Href: "https://brandur.org"},
+ },
}
- if conf.Drafts {
- drafts, err := compileFragmentsDir(sorg.ContentDir + "/fragments-drafts")
- if err != nil {
- return nil, err
+ if len(fragments) > 0 {
+ feed.Updated = *fragments[0].PublishedAt
+ }
+
+ for i, fragment := range fragments {
+ if i >= conf.NumAtomEntries {
+ break
}
- fragments = append(fragments, drafts...)
+ entry := &atom.Entry{
+ Title: fragment.Title,
+ Content: &atom.EntryContent{Content: fragment.Content, Type: "html"},
+ Published: *fragment.PublishedAt,
+ Updated: *fragment.PublishedAt,
+ Link: &atom.Link{Href: conf.SiteURL + "/fragments/" + fragment.Slug},
+ ID: "tag:brandur.org," + fragment.PublishedAt.Format("2006-01-02") + ":fragments/" + fragment.Slug,
+
+ AuthorName: conf.AtomAuthorName,
+ AuthorURI: conf.AtomAuthorURL,
+ }
+ feed.Entries = append(feed.Entries, entry)
}
- sort.Sort(sort.Reverse(fragmentByPublishedAt(fragments)))
+ f, err := os.Create(conf.TargetDir + "/fragments.atom")
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ return feed.Encode(f, " ")
+}
+
+func compileFragmentsIndex(fragments []*Fragment) error {
+ start := time.Now()
+ defer func() {
+ log.Debugf("Compiled fragments index in %v.", time.Now().Sub(start))
+ }()
+
+ fragmentsByYear := groupFragmentsByYear(fragments)
locals := getLocals("Fragments", map[string]interface{}{
- "Fragments": fragments,
+ "FragmentsByYear": fragmentsByYear,
})
- err = renderView(sorg.MainLayout, sorg.ViewsDir+"/fragments/index",
- sorg.TargetDir+"/fragments/index.html", locals)
- if err != nil {
- return nil, err
- }
-
- err = compileFragmentsFeed(fragments)
+ err := renderView(sorg.MainLayout, sorg.ViewsDir+"/fragments/index",
+ conf.TargetDir+"/fragments/index.html", locals)
if err != nil {
- return nil, err
+ return err
}
- return fragments, nil
+ return nil
}
func compileHome(articles []*Article, fragments []*Fragment, photos []*Photo) error {
- if conf.ContentOnly {
- return nil
- }
-
start := time.Now()
defer func() {
log.Debugf("Compiled home in %v.", time.Now().Sub(start))
@@ -501,8 +832,8 @@ func compileHome(articles []*Article, fragments []*Fragment, photos []*Photo) er
fragments = fragments[0:5]
}
- if len(photos) > 27 {
- photos = photos[0:27]
+ if len(photos) > 9 {
+ photos = photos[0:9]
}
locals := getLocals("brandur.org", map[string]interface{}{
@@ -513,7 +844,7 @@ func compileHome(articles []*Article, fragments []*Fragment, photos []*Photo) er
})
err := renderView(sorg.MainLayout, sorg.ViewsDir+"/index",
- sorg.TargetDir+"/index.html", locals)
+ conf.TargetDir+"/index.html", locals)
if err != nil {
return err
}
@@ -521,100 +852,69 @@ func compileHome(articles []*Article, fragments []*Fragment, photos []*Photo) er
return nil
}
-func compileJavascripts(javascripts []string) error {
- start := time.Now()
- defer func() {
- log.Debugf("Compiled script assets in %v.", time.Now().Sub(start))
- }()
+func compilePage(pagesMeta map[string]*Page, dir, name string) error {
+ // Remove the "./pages" directory, but keep the rest of the path.
+ //
+ // Looks something like "about".
+ pagePath := strings.TrimPrefix(dir, sorg.PagesDir) + name
- outPath := sorg.TargetVersionedAssetsDir + "/app.js"
- log.Debugf("Building: %v", outPath)
+ // Looks something like "./public/about".
+ target := path.Join(conf.TargetDir, pagePath)
- outFile, err := os.Create(outPath)
- if err != nil {
- return err
+ // Put a ".html" on if this page is an index. This will allow our local
+ // server to serve it at a directory path, and our upload script is smart
+ // enough to do the right thing with it as well.
+ if path.Base(pagePath) == "index" {
+ target += ".html"
}
- defer outFile.Close()
- for _, javascript := range javascripts {
- inPath := sorg.ContentDir + "/assets/javascripts/" + javascript
- log.Debugf("Including: %v", inPath)
+ locals := map[string]interface{}{
+ "BodyClass": "",
+ "Title": "Untitled Page",
+ }
- inFile, err := os.Open(inPath)
- if err != nil {
- return err
+ pageMeta, ok := pagesMeta[pagePath]
+ if ok {
+ locals = map[string]interface{}{
+ "BodyClass": pageMeta.BodyClass,
+ "Title": pageMeta.Title,
}
+ } else {
+ log.Errorf("No page meta information: %v", pagePath)
+ }
- outFile.WriteString("/* " + javascript + " */\n\n")
- outFile.WriteString("(function() {\n\n")
+ locals = getLocals("Page", locals)
- _, err = io.Copy(outFile, inFile)
- if err != nil {
- return err
- }
+ err := os.MkdirAll(path.Join(conf.TargetDir, dir), 0755)
+ if err != nil {
+ return err
+ }
- outFile.WriteString("\n\n")
- outFile.WriteString("}).call(this);\n\n")
+ err = renderView(sorg.MainLayout, path.Join(dir, name),
+ target, locals)
+ if err != nil {
+ return err
}
return nil
}
-func compilePages() error {
+func compilePassagesIndex(passages []*passages.Passage) error {
start := time.Now()
defer func() {
- log.Debugf("Compiled pages in %v.", time.Now().Sub(start))
+ log.Debugf("Compiled passages index in %v.", time.Now().Sub(start))
}()
- return compilePagesDir(sorg.PagesDir)
-}
-
-func compilePagesDir(dir string) error {
- log.Debugf("Descending into pages directory: %v", dir)
+ locals := getLocals("Passages", map[string]interface{}{
+ "Passages": passages,
+ })
- fileInfos, err := ioutil.ReadDir(dir)
+ err := renderView(sorg.PassageLayout, sorg.ViewsDir+"/passages/index",
+ conf.TargetDir+"/passages/index.html", locals)
if err != nil {
return err
}
- for _, fileInfo := range fileInfos {
- if fileInfo.IsDir() {
- err := compilePagesDir(dir + fileInfo.Name())
- if err != nil {
- return err
- }
- } else {
- // Subtract 4 for the ".ace" extension.
- name := fileInfo.Name()[0 : len(fileInfo.Name())-4]
-
- // Remove the "./pages" directory, but keep the rest of the path.
- //
- // Looks something like "about".
- pagePath := strings.TrimPrefix(dir, sorg.PagesDir) + name
-
- // Looks something like "./public/about".
- target := sorg.TargetDir + "/" + pagePath
-
- locals, ok := pagesVars[pagePath]
- if !ok {
- log.Errorf("No page meta information: %v", pagePath)
- }
-
- locals = getLocals("Page", locals)
-
- err := os.MkdirAll(sorg.TargetDir+"/"+dir, 0755)
- if err != nil {
- return err
- }
-
- err = renderView(sorg.MainLayout, dir+"/"+name,
- target, locals)
- if err != nil {
- return err
- }
- }
- }
-
return nil
}
@@ -633,19 +933,51 @@ func compilePhotos(db *sql.DB) ([]*Photo, error) {
return nil, err
}
+ // Every once in a while go and copy photos into this Git repository so
+ // that the build can use them as a cached version. The Flickr dependency
+ // introduces some brittleness to the build process because they're far
+ // from perfectly reliable, and the more photos that we need to fetch, the
+ // more likely we are to fail. Ideally here we're able to use cached
+ // versions for almost everything, and fetch a relatively few number of
+ // files over the network.
+ //
+ // These can be synced from the built bucket with:
+ //
+ // aws s3 sync s3://brandur.org/assets/photos/ content/photos/
+ //
+ cacheDir := path.Join(sorg.ContentDir, "photos")
+
// Keep a published copy of all the photos that we need.
- var photoAssets []assets.Asset
+ var photoFiles []*downloader.File
for _, photo := range photos {
- photoAssets = append(photoAssets,
- assets.Asset{URL: photo.LargeImageURL,
- Target: sorg.TargetDir + "/assets/photos/" + photo.Slug + "@2x.jpg"},
- assets.Asset{URL: photo.MediumImageURL,
- Target: sorg.TargetDir + "/assets/photos/" + photo.Slug + ".jpg"},
- )
+ image1x := photo.Slug + ".jpg"
+ image2x := photo.Slug + "@2x.jpg"
+
+ if fileExists(path.Join(cacheDir, image1x)) &&
+ fileExists(path.Join(cacheDir, image2x)) {
+
+ log.Debugf("Using cached photos: %v / %v", image1x, image2x)
+
+ for _, image := range []string{image1x, image2x} {
+ err := copyFile(
+ path.Join(cacheDir, image),
+ path.Join(conf.TargetDir, "assets", "photos", image))
+ if err != nil {
+ return nil, err
+ }
+ }
+ } else {
+ photoFiles = append(photoFiles,
+ &downloader.File{URL: photo.MediumImageURL,
+ Target: path.Join(conf.TargetDir, "assets", "photos", image1x)},
+ &downloader.File{URL: photo.LargeImageURL,
+ Target: path.Join(conf.TargetDir, "assets", "photos", image2x)},
+ )
+ }
}
- log.Debugf("Fetching %d photo(s)", len(photoAssets))
- err = assets.Fetch(photoAssets)
+ log.Debugf("Fetching %d photo(s)", len(photoFiles))
+ err = downloader.Fetch(photoFiles)
if err != nil {
return nil, err
}
@@ -656,7 +988,7 @@ func compilePhotos(db *sql.DB) ([]*Photo, error) {
})
err = renderView(sorg.MainLayout, sorg.ViewsDir+"/photos/index",
- sorg.TargetDir+"/photos/index.html", locals)
+ conf.TargetDir+"/photos/index.html", locals)
if err != nil {
return nil, err
}
@@ -705,11 +1037,33 @@ func compileReading(db *sql.DB) error {
"PagesByYearYCounts": pagesByYearYCounts,
})
- err = renderView(sorg.MainLayout, sorg.ViewsDir+"/reading/index",
- sorg.TargetDir+"/reading/index.html", locals)
+ err = renderView(sorg.MainLayout, sorg.ViewsDir+"/reading/index",
+ conf.TargetDir+"/reading/index.html", locals)
+ if err != nil {
+ return err
+ }
+
+ return nil
+}
+
+func compileRobots(outPath string) error {
+ if !conf.Drafts {
+ return nil
+ }
+
+ outFile, err := os.Create(outPath)
if err != nil {
return err
}
+ defer outFile.Close()
+
+ // Allow Twitterbot so that we can preview card images on dev.
+ outFile.WriteString(
+ "User-agent: Twitterbot\n" +
+ "Disallow:\n" +
+ "\n" +
+ "User-agent: *\n" +
+ "Disallow: /")
return nil
}
@@ -752,7 +1106,7 @@ func compileRuns(db *sql.DB) error {
})
err = renderView(sorg.MainLayout, sorg.ViewsDir+"/runs/index",
- sorg.TargetDir+"/runs/index.html", locals)
+ conf.TargetDir+"/runs/index.html", locals)
if err != nil {
return err
}
@@ -760,6 +1114,26 @@ func compileRuns(db *sql.DB) error {
return nil
}
+func compilePassage(dir, name string, draft bool) (*passages.Passage, error) {
+ passage, err := passages.Compile(dir, name, draft, false)
+ if err != nil {
+ return nil, err
+ }
+
+ locals := getLocals(passage.Title, map[string]interface{}{
+ "InEmail": false,
+ "Passage": passage,
+ })
+
+ err = renderView(sorg.PassageLayout, sorg.ViewsDir+"/passages/show",
+ conf.TargetDir+"/passages/"+passage.Slug, locals)
+ if err != nil {
+ return nil, err
+ }
+
+ return passage, nil
+}
+
func compileTwitter(db *sql.DB) error {
if conf.ContentOnly {
return nil
@@ -811,7 +1185,7 @@ func compileTwitter(db *sql.DB) error {
})
err = renderView(sorg.MainLayout, sorg.ViewsDir+"/twitter/index",
- sorg.TargetDir+"/twitter/"+page, locals)
+ conf.TargetDir+"/twitter/"+page, locals)
if err != nil {
return err
}
@@ -820,57 +1194,32 @@ func compileTwitter(db *sql.DB) error {
return nil
}
-func compileStylesheets(stylesheets []string) error {
+func linkFonts() error {
start := time.Now()
defer func() {
- log.Debugf("Compiled stylesheet assets in %v.", time.Now().Sub(start))
+ log.Debugf("Linked font assets in %v.", time.Now().Sub(start))
}()
- outPath := sorg.TargetVersionedAssetsDir + "/app.css"
- log.Debugf("Building: %v", outPath)
-
- outFile, err := os.Create(outPath)
+ source, err := filepath.Abs(sorg.ContentDir + "/fonts")
if err != nil {
return err
}
- defer outFile.Close()
-
- for _, stylesheet := range stylesheets {
- inPath := sorg.ContentDir + "/assets/stylesheets/" + stylesheet
- log.Debugf("Including: %v", inPath)
-
- inFile, err := os.Open(inPath)
- if err != nil {
- return err
- }
-
- outFile.WriteString("/* " + stylesheet + " */\n\n")
-
- if strings.HasSuffix(stylesheet, ".sass") {
- _, err := gcss.Compile(outFile, inFile)
- if err != nil {
- return fmt.Errorf("Error compiling %v: %v", inPath, err)
- }
- } else {
- _, err := io.Copy(outFile, inFile)
- if err != nil {
- return err
- }
- }
- outFile.WriteString("\n\n")
+ dest, err := filepath.Abs(conf.TargetDir + "/assets/fonts/")
+ if err != nil {
+ return err
}
- return nil
+ return ensureSymlink(source, dest)
}
-func linkImageAssets() error {
+func linkImages() error {
start := time.Now()
defer func() {
log.Debugf("Linked image assets in %v.", time.Now().Sub(start))
}()
- assets, err := ioutil.ReadDir(sorg.ContentDir + "/assets/images")
+ assets, err := ioutil.ReadDir(sorg.ContentDir + "/images")
if err != nil {
return err
}
@@ -878,17 +1227,17 @@ func linkImageAssets() error {
for _, asset := range assets {
// we use absolute paths for source and destination because not doing
// so can result in some weird symbolic link inception
- source, err := filepath.Abs(sorg.ContentDir + "/assets/images/" + asset.Name())
+ source, err := filepath.Abs(sorg.ContentDir + "/images/" + asset.Name())
if err != nil {
return err
}
- dest, err := filepath.Abs(sorg.TargetDir + "/assets/" + asset.Name())
+ dest, err := filepath.Abs(conf.TargetDir + "/assets/" + asset.Name())
if err != nil {
return err
}
- err = ensureSymbolicLink(source, dest)
+ err = ensureSymlink(source, dest)
if err != nil {
return err
}
@@ -898,210 +1247,267 @@ func linkImageAssets() error {
}
//
-// Other functions
+// Task generation functions
//
-// Any other functions. Try to keep them alphabetized.
+// These functions are the main entry points for compiling the site's
+// resources.
//
-func compileArticlesDir(dir string) ([]*Article, error) {
- articleInfos, err := ioutil.ReadDir(dir)
+func tasksForArticles(articleChan chan *Article) ([]*pool.Task, error) {
+ tasks, err := tasksForArticlesDir(articleChan, sorg.ContentDir+"/articles", false)
if err != nil {
return nil, err
}
- var articles []*Article
-
- for _, articleInfo := range articleInfos {
- if isHidden(articleInfo.Name()) {
- continue
- }
-
- inPath := dir + "/" + articleInfo.Name()
-
- raw, err := ioutil.ReadFile(inPath)
- if err != nil {
- return nil, err
- }
-
- frontmatter, content, err := splitFrontmatter(string(raw))
+ if conf.Drafts {
+ draftTasks, err := tasksForArticlesDir(articleChan,
+ sorg.ContentDir+"/drafts", true)
if err != nil {
return nil, err
}
- var article Article
- articles = append(articles, &article)
+ tasks = append(tasks, draftTasks...)
+ }
- err = yaml.Unmarshal([]byte(frontmatter), &article)
- if err != nil {
- return nil, err
- }
+ return tasks, nil
+}
- article.Slug = strings.Replace(articleInfo.Name(), ".md", "", -1)
+func tasksForArticlesDir(articleChan chan *Article, dir string, draft bool) ([]*pool.Task, error) {
+ articleInfos, err := ioutil.ReadDir(dir)
+ if err != nil {
+ return nil, err
+ }
- if article.Title == "" {
- return nil, fmt.Errorf("No title for article: %v", inPath)
+ var tasks []*pool.Task
+ for _, articleInfo := range articleInfos {
+ if isHidden(articleInfo.Name()) {
+ continue
}
- if article.PublishedAt == nil {
- return nil, fmt.Errorf("No publish date for article: %v", inPath)
- }
+ name := articleInfo.Name()
+ tasks = append(tasks, pool.NewTask(func() error {
+ article, err := compileArticle(dir, name, draft)
+ if err != nil {
+ return err
+ }
- article.Content = markdown.Render(content)
+ articleChan <- article
+ return nil
+ }))
+ }
- article.TOC, err = toc.Render(article.Content)
- if err != nil {
- return nil, err
- }
+ return tasks, nil
+}
- locals := getLocals(article.Title, map[string]interface{}{
- "Article": article,
- })
+func tasksForFragments(fragmentChan chan *Fragment) ([]*pool.Task, error) {
+ tasks, err := tasksForFragmentsDir(fragmentChan, sorg.ContentDir+"/fragments", false)
+ if err != nil {
+ return nil, err
+ }
- err = renderView(sorg.MainLayout, sorg.ViewsDir+"/articles/show",
- sorg.TargetDir+"/"+article.Slug, locals)
+ if conf.Drafts {
+ draftTasks, err := tasksForFragmentsDir(fragmentChan,
+ sorg.ContentDir+"/fragments-drafts", true)
if err != nil {
return nil, err
}
+
+ tasks = append(tasks, draftTasks...)
}
- return articles, nil
+ return tasks, nil
}
-func compileArticlesFeed(articles []*Article) error {
- feed := &atom.Feed{
- Title: "Articles - brandur.org",
- ID: "tag:brandur.org.org,2013:/articles",
-
- Links: []*atom.Link{
- {Rel: "self", Type: "application/atom+xml", Href: "https://brandur.org/articles.atom"},
- {Rel: "alternate", Type: "text/html", Href: "https://brandur.org"},
- },
+func tasksForFragmentsDir(fragmentChan chan *Fragment, dir string, draft bool) ([]*pool.Task, error) {
+ fragmentInfos, err := ioutil.ReadDir(dir)
+ if err != nil {
+ return nil, err
}
- if len(articles) > 0 {
- feed.Updated = *articles[0].PublishedAt
+ var tasks []*pool.Task
+ for _, fragmentInfo := range fragmentInfos {
+ if isHidden(fragmentInfo.Name()) {
+ continue
+ }
+
+ name := fragmentInfo.Name()
+ tasks = append(tasks, pool.NewTask(func() error {
+ fragment, err := compileFragment(dir, name, draft)
+ if err != nil {
+ return err
+ }
+
+ fragmentChan <- fragment
+ return nil
+ }))
}
- for _, article := range articles {
- entry := &atom.Entry{
- Title: article.Title,
- Content: &atom.EntryContent{Content: article.Content, Type: "html"},
- Published: *article.PublishedAt,
- Updated: *article.PublishedAt,
- Link: &atom.Link{Href: conf.SiteURL + "/" + article.Slug},
- ID: "tag:brandur.org," + article.PublishedAt.Format("2006-01-02") + ":" + article.Slug,
+ return tasks, nil
+}
- AuthorName: conf.AtomAuthorName,
- AuthorURI: conf.AtomAuthorURL,
- }
- feed.Entries = append(feed.Entries, entry)
+func tasksForPages() ([]*pool.Task, error) {
+ meta, err := ioutil.ReadFile(path.Join(sorg.PagesDir, "meta.yaml"))
+ if err != nil {
+ return nil, err
}
- f, err := os.Create(sorg.TargetDir + "/articles.atom")
+ var pagesMeta map[string]*Page
+ err = yaml.Unmarshal(meta, &pagesMeta)
if err != nil {
- return err
+ return nil, err
}
- defer f.Close()
- return feed.Encode(f, " ")
+ return tasksForPagesDir(pagesMeta, sorg.PagesDir)
}
-func compileFragmentsDir(dir string) ([]*Fragment, error) {
- fragmentInfos, err := ioutil.ReadDir(dir)
+func tasksForPagesDir(pagesMeta map[string]*Page, dir string) ([]*pool.Task, error) {
+ log.Debugf("Descending into pages directory: %v", dir)
+
+ fileInfos, err := ioutil.ReadDir(dir)
if err != nil {
return nil, err
}
- var fragments []*Fragment
+ var tasks []*pool.Task
+ for _, fileInfo := range fileInfos {
+ if fileInfo.IsDir() {
+ subtasks, err := tasksForPagesDir(pagesMeta, dir+fileInfo.Name())
+ if err != nil {
+ return nil, err
+ }
+ tasks = append(tasks, subtasks...)
+ } else {
+ if isHidden(fileInfo.Name()) {
+ continue
+ }
- for _, fragmentInfo := range fragmentInfos {
- if isHidden(fragmentInfo.Name()) {
- continue
- }
+ if filepath.Ext(fileInfo.Name()) != ".ace" {
+ continue
+ }
- inPath := dir + "/" + fragmentInfo.Name()
+ // Subtract 4 for the ".ace" extension.
+ name := fileInfo.Name()[0 : len(fileInfo.Name())-4]
- raw, err := ioutil.ReadFile(inPath)
- if err != nil {
- return nil, err
+ tasks = append(tasks, pool.NewTask(func() error {
+ return compilePage(pagesMeta, dir, name)
+ }))
}
+ }
- frontmatter, content, err := splitFrontmatter(string(raw))
- if err != nil {
- return nil, err
- }
+ return tasks, nil
+}
- var fragment Fragment
- fragments = append(fragments, &fragment)
+func tasksForPassages(passageChan chan *passages.Passage) ([]*pool.Task, error) {
+ tasks, err := tasksForPassagesDir(passageChan, sorg.ContentDir+"/passages", false)
+ if err != nil {
+ return nil, err
+ }
- err = yaml.Unmarshal([]byte(frontmatter), &fragment)
+ if conf.Drafts {
+ draftTasks, err := tasksForPassagesDir(passageChan,
+ sorg.ContentDir+"/passages-drafts", true)
if err != nil {
return nil, err
}
- fragment.Slug = strings.Replace(fragmentInfo.Name(), ".md", "", -1)
+ tasks = append(tasks, draftTasks...)
+ }
- if fragment.Title == "" {
- return nil, fmt.Errorf("No title for fragment: %v", inPath)
- }
+ return tasks, nil
+}
- if fragment.PublishedAt == nil {
- return nil, fmt.Errorf("No publish date for fragment: %v", inPath)
- }
+func tasksForPassagesDir(passageChan chan *passages.Passage, dir string, draft bool) ([]*pool.Task, error) {
+ passageInfos, err := ioutil.ReadDir(dir)
+ if err != nil {
+ return nil, err
+ }
- fragment.Content = markdown.Render(content)
+ var tasks []*pool.Task
+ for _, passageInfo := range passageInfos {
+ if isHidden(passageInfo.Name()) {
+ continue
+ }
- locals := getLocals(fragment.Title, map[string]interface{}{
- "Fragment": fragment,
- })
+ name := passageInfo.Name()
+ tasks = append(tasks, pool.NewTask(func() error {
+ passage, err := compilePassage(dir, name, draft)
+ if err != nil {
+ return err
+ }
- err = renderView(sorg.MainLayout, sorg.ViewsDir+"/fragments/show",
- sorg.TargetDir+"/fragments/"+fragment.Slug, locals)
- if err != nil {
- return nil, err
- }
+ passageChan <- passage
+ return nil
+ }))
}
- return fragments, nil
+ return tasks, nil
}
-func compileFragmentsFeed(fragments []*Fragment) error {
- feed := &atom.Feed{
- Title: "Fragments - brandur.org",
- ID: "tag:brandur.org.org,2013:/fragments",
-
- Links: []*atom.Link{
- {Rel: "self", Type: "application/atom+xml", Href: "https://brandur.org/fragments.atom"},
- {Rel: "alternate", Type: "text/html", Href: "https://brandur.org"},
- },
- }
+//
+// Other functions
+//
+// Any other functions. Try to keep them alphabetized.
+//
- if len(fragments) > 0 {
- feed.Updated = *fragments[0].PublishedAt
- }
+func accumulateArticles(articles *[]*Article) chan *Article {
+ articleChan := make(chan *Article, 100)
+ go func() {
+ for article := range articleChan {
+ *articles = append(*articles, article)
+ }
+ }()
+ return articleChan
+}
- for _, fragment := range fragments {
- entry := &atom.Entry{
- Title: fragment.Title,
- Content: &atom.EntryContent{Content: fragment.Content, Type: "html"},
- Published: *fragment.PublishedAt,
- Updated: *fragment.PublishedAt,
- Link: &atom.Link{Href: conf.SiteURL + "/fragments/" + fragment.Slug},
- ID: "tag:brandur.org," + fragment.PublishedAt.Format("2006-01-02") + ":fragments/" + fragment.Slug,
+func accumulateFragments(fragments *[]*Fragment) chan *Fragment {
+ fragmentChan := make(chan *Fragment, 100)
+ go func() {
+ for fragment := range fragmentChan {
+ *fragments = append(*fragments, fragment)
+ }
+ }()
+ return fragmentChan
+}
- AuthorName: conf.AtomAuthorName,
- AuthorURI: conf.AtomAuthorURL,
+func accumulatePassages(p *[]*passages.Passage) chan *passages.Passage {
+ passageChan := make(chan *passages.Passage, 100)
+ go func() {
+ for passage := range passageChan {
+ *p = append(*p, passage)
}
- feed.Entries = append(feed.Entries, entry)
+ }()
+ return passageChan
+}
+
+// Naturally not provided by the Go language because copying files "has tricky
+// edge cases". You just can't make this stuff up.
+func copyFile(src, dst string) error {
+ in, err := os.Open(src)
+ if err != nil {
+ return err
}
+ defer in.Close()
- f, err := os.Create(sorg.TargetDir + "/fragments.atom")
+ out, err := os.Create(dst)
if err != nil {
return err
}
- defer f.Close()
+ defer out.Close()
- return feed.Encode(f, " ")
+ _, err = io.Copy(out, in)
+ return err
+}
+
+// Just a shortcut to try and cut down on Go's extreme verbosity.
+func fileExists(file string) bool {
+ _, err := os.Stat(file)
+ if err == nil {
+ return true
+ }
+ if os.IsNotExist(err) {
+ return false
+ }
+ panic(err)
}
// Gets a map of local values for use while rendering a template and includes
@@ -1110,8 +1516,10 @@ func getLocals(title string, locals map[string]interface{}) map[string]interface
defaults := map[string]interface{}{
"BodyClass": "",
"GoogleAnalyticsID": conf.GoogleAnalyticsID,
+ "LocalFonts": conf.LocalFonts,
"Release": sorg.Release,
"Title": title,
+ "TwitterCard": nil,
"ViewportWidth": "device-width",
}
@@ -1143,7 +1551,7 @@ func getPhotosData(db *sql.DB) ([]*Photo, error) {
WHERE type = 'flickr'
AND (metadata -> 'medium_width')::int = 500
ORDER BY occurred_at DESC
- LIMIT 30
+ LIMIT 90
`)
if err != nil {
return nil, err
@@ -1457,7 +1865,8 @@ func getRunsLastYearData(db *sql.DB) ([]string, []float64, error) {
SUM(distance) AS distance
FROM runs
WHERE occurred_at_local > NOW() - '180 days'::interval
- GROUP BY day
+ GROUP BY day
+ ORDER BY day
),
-- generates a baseline series of every day in the last 180 days
@@ -1470,11 +1879,10 @@ func getRunsLastYearData(db *sql.DB) ([]string, []float64, error) {
NOW(), '1 day'::interval) i
)
- SELECT to_char(d.day, 'Mon DD') AS day,
+ SELECT to_char(d.day, 'Mon') AS day,
d.distance + COALESCE(rd.distance, 0::float)
FROM days d
LEFT JOIN runs_days rd ON d.day = rd.day
- ORDER BY day ASC
`)
if err != nil {
return nil, nil, err
@@ -1599,6 +2007,38 @@ func getTwitterData(db *sql.DB, withReplies bool) ([]*Tweet, error) {
return tweets, nil
}
+func groupArticlesByYear(articles []*Article) []*articleYear {
+ var year *articleYear
+ var years []*articleYear
+
+ for _, article := range articles {
+ if year == nil || year.Year != article.PublishedAt.Year() {
+ year = &articleYear{article.PublishedAt.Year(), nil}
+ years = append(years, year)
+ }
+
+ year.Articles = append(year.Articles, article)
+ }
+
+ return years
+}
+
+func groupFragmentsByYear(fragments []*Fragment) []*fragmentYear {
+ var year *fragmentYear
+ var years []*fragmentYear
+
+ for _, fragment := range fragments {
+ if year == nil || year.Year != fragment.PublishedAt.Year() {
+ year = &fragmentYear{fragment.PublishedAt.Year(), nil}
+ years = append(years, year)
+ }
+
+ year.Fragments = append(year.Fragments, fragment)
+ }
+
+ return years
+}
+
func groupReadingsByYear(readings []*Reading) []*readingYear {
var year *readingYear
var years []*readingYear
@@ -1638,21 +2078,35 @@ func groupTwitterByYearAndMonth(tweets []*Tweet) []*tweetYear {
return years
}
+// Detects a hidden file, i.e. one that starts with a dot.
func isHidden(file string) bool {
return strings.HasPrefix(file, ".")
}
-func ensureSymbolicLink(source, dest string) error {
+func ensureSymlink(source, dest string) error {
log.Debugf("Checking symbolic link (%v): %v -> %v",
path.Base(source), source, dest)
+ var actual string
+
_, err := os.Stat(dest)
+
+ // Note that if a symlink file does exist, but points to a non-existent
+ // location, we still get an "does not exist" error back, so we fall down
+ // to the general create path so that the symlink file can be removed.
+ //
+ // The call to RemoveAll does not affect the other path of the symlink file
+ // not being present because it doesn't care whether or not the file it's
+// trying to remove is actually there.
if os.IsNotExist(err) {
log.Debugf("Destination link does not exist. Creating.")
- return os.Symlink(source, dest)
+ goto create
+ }
+ if err != nil {
+ return err
}
- actual, err := os.Readlink(dest)
+ actual, err = os.Readlink(dest)
if err != nil {
return err
}
@@ -1664,6 +2118,7 @@ func ensureSymbolicLink(source, dest string) error {
log.Debugf("Destination links to wrong source. Creating.")
+create:
err = os.RemoveAll(dest)
if err != nil {
return err
@@ -1672,6 +2127,25 @@ func ensureSymbolicLink(source, dest string) error {
return os.Symlink(source, dest)
}
+// Checks if the path exists as a common image format (.jpg or .png only). If
+// so, returns the discovered extension (e.g. "jpg") and boolean true.
+// Otherwise returns an empty string and boolean false.
+func pathAsImage(extensionlessPath string) (string, bool) {
+ // extensions must be lowercased
+ formats := []string{"jpg", "png"}
+
+ for _, format := range formats {
+ _, err := os.Stat(extensionlessPath + "." + format)
+ if err != nil {
+ continue
+ }
+
+ return format, true
+ }
+
+ return "", false
+}
+
func renderView(layout, view, target string, locals map[string]interface{}) error {
log.Debugf("Rendering: %v", target)
@@ -1697,16 +2171,26 @@ func renderView(layout, view, target string, locals map[string]interface{}) erro
return nil
}
-func splitFrontmatter(content string) (string, string, error) {
- parts := regexp.MustCompile("(?m)^---").Split(content, 3)
-
- if len(parts) > 1 && parts[0] != "" {
- return "", "", errBadFrontmatter
- } else if len(parts) == 2 {
- return "", strings.TrimSpace(parts[1]), nil
- } else if len(parts) == 3 {
- return strings.TrimSpace(parts[1]), strings.TrimSpace(parts[2]), nil
+// Runs the given tasks in a pool.
+//
+// After the run, if any errors occurred, it prints the first 10. Returns true
+// if all tasks succeeded. If a false is returned, the caller should consider
+// exiting with non-zero status.
+func runTasks(tasks []*pool.Task) bool {
+ p := pool.NewPool(tasks, conf.Concurrency)
+ p.Run()
+
+ var numErrors int
+ for _, task := range p.Tasks {
+ if task.Err != nil {
+ log.Error(task.Err)
+ numErrors++
+ }
+ if numErrors >= 10 {
+ log.Error("Too many errors.")
+ break
+ }
}
- return "", strings.TrimSpace(parts[0]), nil
+ return !p.HasErrors()
}
diff --git a/cmd/sorg-build/main_test.go b/cmd/sorg-build/main_test.go
index 0e28e415c..20b35a647 100644
--- a/cmd/sorg-build/main_test.go
+++ b/cmd/sorg-build/main_test.go
@@ -1,12 +1,211 @@
package main
import (
+ "database/sql"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path"
"testing"
+ "time"
"github.com/brandur/sorg"
+ "github.com/brandur/sorg/pool"
+ _ "github.com/brandur/sorg/testing"
+ _ "github.com/lib/pq"
assert "github.com/stretchr/testify/require"
)
+var db *sql.DB
+
+func init() {
+ conf.TargetDir = "./public"
+ err := sorg.CreateOutputDirs(conf.TargetDir)
+ if err != nil {
+ panic(err)
+ }
+
+ db, err = sql.Open("postgres", "postgres://localhost/sorg-test?sslmode=disable")
+ if err != nil {
+ panic(err)
+ }
+}
+
+func TestCompilePhotos(t *testing.T) {
+ //
+ // No database
+ //
+
+ photos, err := compilePhotos(nil)
+ assert.NoError(t, err)
+ assert.Equal(t, []*Photo(nil), photos)
+
+ //
+ // With empty database
+ //
+
+ photos, err = compilePhotos(db)
+ assert.NoError(t, err)
+ assert.Equal(t, []*Photo(nil), photos)
+
+ //
+ // With results
+ //
+
+ // TODO: insert photos
+ //photos, err = compilePhotos(db)
+ //assert.NoError(t, err)
+}
+
+func TestCompileReading(t *testing.T) {
+ //
+ // No database
+ //
+
+ err := compileReading(nil)
+ assert.NoError(t, err)
+
+ //
+ // With empty database
+ //
+
+ err = compileReading(db)
+ assert.NoError(t, err)
+
+ //
+ // With results
+ //
+
+ // TODO: insert reading
+ //err = compileReading(db)
+ //assert.NoError(t, err)
+}
+
+func TestCompileRobots(t *testing.T) {
+ dir, err := ioutil.TempDir("", "target")
+ assert.NoError(t, err)
+ path := path.Join(dir, "robots.txt")
+
+ conf.Drafts = false
+ err = compileRobots(path)
+ assert.NoError(t, err)
+
+ _, err = os.Stat(path)
+ assert.True(t, os.IsNotExist(err))
+
+ conf.Drafts = true
+ err = compileRobots(path)
+ assert.NoError(t, err)
+
+ _, err = os.Stat(path)
+ assert.NoError(t, err)
+}
+
+func TestCompileRuns(t *testing.T) {
+ //
+ // No database
+ //
+
+ err := compileRuns(nil)
+ assert.NoError(t, err)
+
+ //
+ // With empty database
+ //
+
+ err = compileRuns(db)
+ assert.NoError(t, err)
+
+ //
+ // With results
+ //
+
+ // TODO: insert runs
+ //err = compileRuns(db)
+ //assert.NoError(t, err)
+}
+
+func TestCompileTwitter(t *testing.T) {
+ //
+ // No database
+ //
+
+ err := compileTwitter(nil)
+ assert.NoError(t, err)
+
+ //
+ // With empty database
+ //
+
+ err = compileTwitter(db)
+ assert.NoError(t, err)
+
+ //
+ // With results
+ //
+
+ now := time.Now()
+ tweet := &Tweet{
+ Content: "Hello, world!",
+ OccurredAt: &now,
+ Slug: "1234",
+ }
+ insertTweet(t, tweet, false)
+
+ err = compileTwitter(db)
+ assert.NoError(t, err)
+}
+
+func TestEnsureSymlink(t *testing.T) {
+ dir, err := ioutil.TempDir("", "symlink")
+ assert.NoError(t, err)
+
+ source := path.Join(dir, "source")
+ err = ioutil.WriteFile(source, []byte("source"), 0755)
+ assert.NoError(t, err)
+
+ dest := path.Join(dir, "symlink-dest")
+
+ //
+ // Case 1: Symlink does not exist
+ //
+
+ err = ensureSymlink(source, dest)
+ assert.NoError(t, err)
+
+ actual, err := os.Readlink(dest)
+ assert.Equal(t, source, actual)
+
+ //
+ // Case 2: Symlink does exist
+ //
+ // Consists solely of re-running the previous test case.
+ //
+
+ err = ensureSymlink(source, dest)
+ assert.NoError(t, err)
+
+ actual, err = os.Readlink(dest)
+ assert.Equal(t, source, actual)
+
+ //
+ // Case 3: Symlink file exists, but source doesn't
+ //
+
+ err = os.RemoveAll(dest)
+ assert.NoError(t, err)
+
+ source = path.Join(dir, "source")
+ err = ioutil.WriteFile(source, []byte("source"), 0755)
+ assert.NoError(t, err)
+
+ err = ensureSymlink(source, dest)
+ assert.NoError(t, err)
+
+ actual, err = os.Readlink(dest)
+ assert.Equal(t, source, actual)
+}
+
func TestGetLocals(t *testing.T) {
locals := getLocals("Title", map[string]interface{}{
"Foo": "Bar",
@@ -22,31 +221,59 @@ func TestIsHidden(t *testing.T) {
assert.Equal(t, false, isHidden("article"))
}
-func TestSplitFrontmatter(t *testing.T) {
- frontmatter, content, err := splitFrontmatter(`---
-foo: bar
----
+func TestRunTasks(t *testing.T) {
+ conf.Concurrency = 3
-other`)
- assert.NoError(t, err)
- assert.Equal(t, "foo: bar", frontmatter)
- assert.Equal(t, "other", content)
+ //
+ // Success case
+ //
- frontmatter, content, err = splitFrontmatter(`other`)
- assert.NoError(t, err)
- assert.Equal(t, "", frontmatter)
- assert.Equal(t, "other", content)
+ tasks := []*pool.Task{
+ pool.NewTask(func() error { return nil }),
+ pool.NewTask(func() error { return nil }),
+ pool.NewTask(func() error { return nil }),
+ }
+ assert.Equal(t, true, runTasks(tasks))
- frontmatter, content, err = splitFrontmatter(`---
-foo: bar
----
-`)
- assert.NoError(t, err)
- assert.Equal(t, "foo: bar", frontmatter)
- assert.Equal(t, "", content)
+ //
+ // Failure case (1 error)
+ //
- frontmatter, content, err = splitFrontmatter(`foo: bar
----
-`)
- assert.Equal(t, errBadFrontmatter, err)
+ tasks = []*pool.Task{
+ pool.NewTask(func() error { return nil }),
+ pool.NewTask(func() error { return nil }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ }
+ assert.Equal(t, false, runTasks(tasks))
+
+ //
+ // Failure case (11 errors)
+ //
+ // Here we'll exit with a "too many errors" message.
+ //
+
+ tasks = []*pool.Task{
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ pool.NewTask(func() error { return fmt.Errorf("error") }),
+ }
+ assert.Equal(t, false, runTasks(tasks))
+}
+
+func insertTweet(t *testing.T, tweet *Tweet, reply bool) {
+ _, err := db.Exec(`
+ INSERT INTO events
+ (content, occurred_at, metadata, slug, type)
+ VALUES
+ ($1, $2, hstore('reply', $3), $4, $5)
+ `, tweet.Content, tweet.OccurredAt, reply, tweet.Slug, "twitter")
+ assert.NoError(t, err)
}
diff --git a/cmd/sorg-passages/main.go b/cmd/sorg-passages/main.go
new file mode 100644
index 000000000..3a0b4fb8f
--- /dev/null
+++ b/cmd/sorg-passages/main.go
@@ -0,0 +1,137 @@
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "flag"
+ "fmt"
+ "log"
+ "os"
+ "path/filepath"
+
+ "github.com/aymerick/douceur/inliner"
+ "github.com/brandur/sorg"
+ "github.com/brandur/sorg/passages"
+ "github.com/brandur/sorg/templatehelpers"
+ "github.com/joeshaw/envdecode"
+ "github.com/yosssi/ace"
+ "gopkg.in/mailgun/mailgun-go.v1"
+)
+
+const (
+ mailDomain = "list.brandur.org"
+ fromAddress = "Brandur <" + listAddress + ">"
+ listAddress = "passages@" + mailDomain
+ listStagingAddress = "passages-staging@" + mailDomain
+ replyToAddress = "brandur@brandur.org"
+ testAddress = replyToAddress
+)
+
+// Conf contains configuration information for the command. It's extracted from
+// environment variables.
+type Conf struct {
+ // MailgunAPIKey is a key for Mailgun used to send email.
+ MailgunAPIKey string `env:"MAILGUN_API_KEY,required"`
+}
+
+// Left as a global for now for the sake of convenience, but it's not used in
+// very many places and can probably be refactored as a local if desired.
+var conf Conf
+
+func renderAndSend(path string, live, staging bool) error {
+ dir := filepath.Dir(path)
+ name := filepath.Base(path)
+
+ passage, err := passages.Compile(dir, name, false, true)
+ if err != nil {
+ return err
+ }
+
+ locals := map[string]interface{}{
+ "InEmail": true,
+ "Passage": passage,
+ "Title": passage.Title,
+ }
+
+ template, err := ace.Load(
+ sorg.PassageLayout,
+ sorg.ViewsDir+"/passages/show",
+ &ace.Options{FuncMap: templatehelpers.FuncMap})
+ if err != nil {
+ return err
+ }
+
+ var b bytes.Buffer
+
+ writer := bufio.NewWriter(&b)
+
+ err = template.Execute(writer, locals)
+ if err != nil {
+ return err
+ }
+
+ writer.Flush()
+
+ html, err := inliner.Inline(b.String())
+ if err != nil {
+ return err
+ }
+
+ var recipient string
+ if live {
+ recipient = listAddress
+ } else if staging {
+ recipient = listStagingAddress
+ } else {
+ recipient = testAddress
+ }
+
+ mg := mailgun.NewMailgun(mailDomain, conf.MailgunAPIKey, "")
+
+ subject := fmt.Sprintf("Passages & Glass %s — %s",
+ passage.Issue, passage.Title)
+
+ message := mailgun.NewMessage(
+ fromAddress,
+ subject,
+ passage.ContentRaw,
+ recipient)
+ message.SetReplyTo(replyToAddress)
+ message.SetHtml(html)
+
+ resp, _, err := mg.Send(message)
+ if err != nil {
+ log.Fatal(err)
+ }
+ log.Printf(`Sent to: %s (response: "%s")`, recipient, resp)
+
+ return nil
+}
+
+func main() {
+ flag.Usage = func() {
+ fmt.Fprintf(os.Stderr, "Usage: %v [-live] [-staging] \n", os.Args[0])
+ flag.PrintDefaults()
+ os.Exit(0)
+ }
+
+ live := flag.Bool("live", false,
+ "Send to list (as opposed to dry run)")
+ staging := flag.Bool("staging", false,
+ "Send to staging list (as opposed to dry run)")
+ flag.Parse()
+
+ if len(flag.Args()) != 1 {
+ flag.Usage()
+ }
+
+ err := envdecode.Decode(&conf)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ err = renderAndSend(flag.Arg(0), *live, *staging)
+ if err != nil {
+ log.Fatal(err)
+ }
+}
diff --git a/cmd/sorg-serve/main.go b/cmd/sorg-serve/main.go
index e1188489a..b0b94a66b 100644
--- a/cmd/sorg-serve/main.go
+++ b/cmd/sorg-serve/main.go
@@ -14,31 +14,37 @@ import (
type Conf struct {
// Port is the port on which the command will serve the site over HTTP.
Port int `env:"PORT,default=5001"`
+
+ // TargetDir is the target location where the site was built to.
+ TargetDir string `env:"TARGET_DIR,default=./public"`
}
+// Left as a global for now for the sake of convenience, but it's not used in
+// very many places and can probably be refactored as a local if desired.
+var conf Conf
+
func main() {
sorg.InitLog(false)
- var conf Conf
err := envdecode.Decode(&conf)
if err != nil {
log.Fatal(err)
}
- err = sorg.CreateTargetDirs()
+ err = sorg.CreateOutputDirs(conf.TargetDir)
if err != nil {
log.Fatal(err)
}
- err = serve(conf.Port)
+ err = serve(conf.TargetDir, conf.Port)
if err != nil {
log.Fatal(err)
}
}
-func serve(port int) error {
- log.Infof("Serving '%v' on port %v", path.Clean(sorg.TargetDir), port)
+func serve(targetDir string, port int) error {
+ log.Infof("Serving '%v' on port %v", path.Clean(targetDir), port)
log.Infof("Open browser to: http://localhost:%v/", port)
- handler := http.FileServer(http.Dir(sorg.TargetDir))
+ handler := http.FileServer(http.Dir(targetDir))
return http.ListenAndServe(":"+strconv.Itoa(port), handler)
}
diff --git a/cmd/sorg-serve/main_test.go b/cmd/sorg-serve/main_test.go
new file mode 100644
index 000000000..59430d092
--- /dev/null
+++ b/cmd/sorg-serve/main_test.go
@@ -0,0 +1,11 @@
+package main
+
+import (
+ "testing"
+)
+
+func TestMain(t *testing.T) {
+ // Unfortunately, this package is somewhat difficult to test because
+ // there's no way to shut down a server's ListenAndServe. There's not that
+ // much code in here so it's not too bad.
+}
diff --git a/content/articles/accessible-apis.md b/content/articles/accessible-apis.md
new file mode 100644
index 000000000..3ffecd03e
--- /dev/null
+++ b/content/articles/accessible-apis.md
@@ -0,0 +1,165 @@
+---
+hook: A set of patterns to make APIs more accessible to developers; lowering the barrier
+ of entry for new users, and easing the maintenance of consuming applications.
+location: San Francisco
+published_at: 2013-09-01T15:59:44Z
+title: Developer Accessible APIs
+---
+
+Many providers get users started off on their API by pointing them to an
+extensive set of documents, designed to help build a conceptual model for those
+users to leverage later when they start to build something with it. Because
+asking users to consume significant documentation upfront is a lot to ask these
+days, a common pattern is to include a set of examples and tutorials to help
+get users up and running as quickly possible. These are effective and
+time-proven techniques, but there's still room for improvement. Consider these
+problems:
+
+* API behavior must be documented exhaustively as developers have no easy way
+ of checking particular cases for themselves. This documentation is expensive
+ to create and maintain, and without a rigid process to keep it updated, will
+ inevitably fall into disrepair.
+* Documentation will never be able to cover every possible case. At the end of
+ the day, the best way to be sure of the API's behavior in some corner case is
+ to try the API itself.
+* Tutorials and examples must often make assumptions based on what languages
+ and tools users are expected to use. Users left without relevant documents will
+ often have to project one of the available documents onto their own toolset,
+ for example a developer coding in Clojure and following a Ruby tutorial.
+
+It's possible to solve some of these problems by ensuring that APIs are
+**developer accessible**, meaning that rather than only optimizing for the case
+of those applications that will be consuming them over the long run, they also
+cater to those developers that are learning the API in order to build new
+products with it. Developers using the API this way will be coming in and
+making manual one-off calls with their tools of choice, before transcribing
+those calls to the more permanent medium of whatever applications they build.
+
+This kind of accessibility isn't just good for jumpstarting new developers
+either. As the existing API economy becomes ever more prominent, we have to
+consider that over a long enough period of time, changes in either clients or
+the APIs themselves will inexorably lead to breakages. In such situations, we
+should aim to make it as easy as possible for a developer to jump in with their
+toolkit and quickly figure out what's going on so that the problem can be
+remedied.
+
+Developer accessibility is more of an idea than any particular method, and as
+such there's no definite way of implementing it, but a number of patterns that
+we see in the wild can help to illustrate the concept.
+
+## Patterns (#patterns)
+
+### OAuth 2 (#oauth2)
+
+Much of the added complexity around OAuth 1 stems from the extra layer of
+security that was built on top of it. OAuth 2 simply relies on HTTPS to take
+care of this, and the result is a much more usable protocol. While OAuth 1 APIs
+often need to be called through specialized consumer libraries, calls to OAuth
+2 APIs can generally be made with any generic client, including plain old Curl,
+which significantly lowers the barrier of entry to an API.
+
+Even Twitter, well known for its restrictive APIs [has an easy way of procuring
+an OAuth 2 access token](https://gist.github.com/brandur/5845931).
+
+### Bearer token authorization (#bearer-tokens)
+
+A very simple pattern for use with OAuth 2 is allowing users to authorize with
+a bearer token via the `Authorization` header. This ensures that any client
+that can send an HTTP header has an easy way in without needing to do
+base64-encoding.
+
+ curl -H "Authorization: Bearer 01234567-89ab-cdef-0123-456789abcdef" ...
+
+### Curlability (#curl)
+
+A consistent theme across many patterns is simply that an API should be
+accessible to any generic HTTP client, with Curl occupying the place of that
+baseline tool in many of our kits due to its relative ubiquity. Good Curl
+accessibility is useful for both new developers who can start experimenting
+with an API immediately, and for the API owners themselves, who can take
+advantage of it during the development of new API features as well.
+
+A very simple pattern of immediately improving an API's Curlability is to
+prettify JSON output for Curl clients as [I've previously
+described](https://mutelight.org/pretty-json).
+
+### In-response scope hints (#scope-hints)
+
+It can be quite helpful to return metadata about the current request and the
+current endpoint for a developer to digest while they're testing calls against
+an API.
+
+For example, a fairly general problem with APIs being consumed by OAuth-enabled
+apps is that apps will often request scopes with more liberal permissions than
+the app actually needs, which isn't ideal from a security perspective. By
+returning a header like `OAuth-Scope-Accepted` below, we give developers an
+easy way to determine what permissions are needed on the endpoints they're
+accessing, allowing them to lock down their scope before releasing an app.
+
+ Oauth-Scope: global
+ Oauth-Scope-Accepted: global identity
+
+### In-response ordering hints (#ordering-hints)
+
+For our [V3 platform
+API](https://devcenter.heroku.com/articles/platform-api-reference) at Heroku,
+list ordering is accomplished by specifying that order through a `Range`
+header, but ordering can only be carried out on particular fields. Those fields
+can either be looked up in the reference documentation, or a developer can
+easily check which ones are supported by inspecting the `Accept-Ranges` header
+that comes back with list responses:
+
+ Accept-Ranges: id, name
+ Range: id ..
+
+### Ship a service stub (#service-stubs)
+
+I've previously talked about how Rack service stubs can be [used to improve the
+development and testing experience](https://brandur.org/service-stubs) of apps
+that are heaviliy dependent on external APIs. An API can also ship its own
+service stub, which allows developers to try API calls that might otherwise
+mutate data when done in a production environment. See the [Heroku API
+stub](https://github.com/heroku/heroku-api-stub) for an example of this
+technique.
+
+### Programmatic maps (#programmatic-maps)
+
+An interesting Hypermedia-related technique that's gaining some traction is to
+provide a set of links at an API's root that point to other available
+endpoints. Coupled with strong RESTful conventions, this might allow a
+developer to skip the reference documentation completely by learning the API by
+navigating around it with Curl.
+
+Try GitHub's root to see this in the real world:
+
+ curl https://api.github.com
+
+ {
+ "current_user_url": "https://api.github.com/user",
+ "authorizations_url": "https://api.github.com/authorizations",
+ "emails_url": "https://api.github.com/user/emails",
+ "emojis_url": "https://api.github.com/emojis",
+ "events_url": "https://api.github.com/events",
+ "feeds_url": "https://api.github.com/feeds",
+ "following_url": "https://api.github.com/user/following{/target}",
+ "gists_url": "https://api.github.com/gists{/gist_id}",
+ "hub_url": "https://api.github.com/hub",
+ "issue_search_url": "https://api.github.com/legacy/issues/search/{owner}/{repo}/{state}/{keyword}",
+ "issues_url": "https://api.github.com/issues",
+ "keys_url": "https://api.github.com/user/keys",
+ "notifications_url": "https://api.github.com/notifications",
+ "organization_repositories_url": "https://api.github.com/orgs/{org}/repos/{?type,page,per_page,sort}",
+ "organization_url": "https://api.github.com/orgs/{org}",
+ "public_gists_url": "https://api.github.com/gists/public",
+ "rate_limit_url": "https://api.github.com/rate_limit",
+ "repository_url": "https://api.github.com/repos/{owner}/{repo}",
+ "repository_search_url": "https://api.github.com/legacy/repos/search/{keyword}{?language,start_page}",
+ "current_user_repositories_url": "https://api.github.com/user/repos{?type,page,per_page,sort}",
+ "starred_url": "https://api.github.com/user/starred{/owner}{/repo}",
+ "starred_gists_url": "https://api.github.com/gists/starred",
+ "team_url": "https://api.github.com/teams",
+ "user_url": "https://api.github.com/users/{user}",
+ "user_organizations_url": "https://api.github.com/user/orgs",
+ "user_repositories_url": "https://api.github.com/users/{user}/repos{?type,page,per_page,sort}",
+ "user_search_url": "https://api.github.com/legacy/user/search/{keyword}"
+ }
diff --git a/content/articles/accidental-evangelist.md b/content/articles/accidental-evangelist.md
new file mode 100644
index 000000000..51dfe4bc9
--- /dev/null
+++ b/content/articles/accidental-evangelist.md
@@ -0,0 +1,62 @@
+---
+title: The Accidental Evangelist
+published_at: 2017-05-28T18:22:41Z
+hook: Let's talk about the bad aspects of the technology we
+ use as well as the good. Better information will lead to
+ better decisions in the next generation of technical
+ architecture.
+---
+
+In our industry, technological hype has a bad tendency to
+work like a ratchet. We sing the praises of the various
+systems we use, but even after using them for a while and
+seeing that the hype doesn't match reality, we don't knock
+them back down.
+
+Over the last few decades we've seen big pushes for tools
+like XML, Scala, PHP, MySQL, CouchDB, Erlang, Mnesia,
+CoffeeScript, Node, Rails, Django, MongoDB, Meteor, Ember,
+Angular, React, Riak, Clojure, Cassandra, Go, and countless
+others. More recently, the industry's seen enough
+saturation that fatigue has led to a bit more skepticism,
+but we can still see the same signs of ongoing pushes,
+even if somewhat more tempered (e.g. Rust, Elixir, Kotlin,
+Crystal, ...).
+
+Some of these tools have withstood the test of time and are
+still fine choices for a new technology stack. On the other
+hand, a good number of them are _not_. Trade offs aside,
+existential problems like a pathologic lack of runtime
+safety, a decaying ecosystem, or a design that leads to
+burdensome operation become apparent with use, and make
+their avoidance perfectly rational. Sometimes we make
+mistakes, or the state of the art improves, and ideas that
+we originally thought to be good hit the end of their
+useful lifetime.
+
+It's a very human thing to do to withhold criticism. There
+are humans on the other end of all this technology, and
+frequently they're even humans that we know and like. It's
+also rarely personally helpful to speak ill of systems that
+realistically speaking, we're going to be married to for some
+time to come.
+
+But we should consider the ramifications of staying silent --
+by withholding information on the bad technology in our
+stacks, we're cheating people and companies who aren't
+using it yet, but may yet adopt its use. By articulating
+problems and trade offs, you'll better inform decisions and
+potentially save _millions_ of hours of future productivity
+that would have otherwise been sacrificed to the altar.
+
+Disastrous pitfalls, vampiric operational overhead, chronic
+underdesign, or even just obsolescence are never in the
+manual, and often not obvious until you're already waist
+deep. By being on the inside of these things, you have
+access to special insight that other people can't get
+without fully investing themselves at great expense.
+
+This isn't to say that we should unduly sling mud, but
+pieces that are honest, detail-oriented, thoroughly
+researched, but also critical, might be the best way that
+you can help your fellow builders.
diff --git a/content/articles/acid.md b/content/articles/acid.md
new file mode 100644
index 000000000..eacbe5744
--- /dev/null
+++ b/content/articles/acid.md
@@ -0,0 +1,391 @@
+---
+title: Building Robust Systems With ACID and Constraints
+published_at: 2017-05-16T14:03:01Z
+location: San Francisco
+hook: On ensuring system integrity, operability, and
+ correctness through a solid foundational database, and
+ how ACID transactions and strong constraints work in your
+ favor. Why to prefer Postgres over MongoDB.
+---
+
+In 1983, Andreas Reuter and Theo Härder coined the acronym
+ACID as shorthand for _atomicity_, _consistency_,
+_isolation_, and _durability_. They were building on
+earlier work by Jim Gray who'd proposed atomicity,
+consistency, and durability, but had initially left out the
+_I_. ACID is one of those inventions of the 80s that's not
+only just still in use in the form of major database
+systems like Postgres, Oracle, and MSSQL, but which has
+never been displaced by a better idea.
+
+In the last decade we've seen the emergence of a number of
+new flavors of data store that come with untraditional
+features like streaming changesets, JavaScript APIs, or
+nestable JSON documents. Most of them assume that the need
+for horizontal partitioning is a given in this day and age
+and therefore ACID is put to the altar (this doesn't
+necessarily have to be the case, see below). Every decision
+comes with trade offs, but trading away these powerful
+guarantees for the novelties du jour or an unexamined
+assumption that horizontal scaling will very soon be a
+critically required feature is as raw of a deal as you'll
+ever see in the technical world.
+
+But why the mismatch in values? I think it's because many
+of us have taught ourselves programming on frameworks like
+Rails, or been educated in environments where ACID
+databases were a part of the package, and we've taken them
+for granted. They've always been there, and we've never
+necessarily considered closely exactly what they can do for
+us and why they're important. In many cases this also leads
+to their most powerful features being underutilized.
+
+I want to convince you that ACID databases are one of the
+most important tools in existence for ensuring
+maintainability and data correctness in big production
+systems. Let's start by digging into each of their namesake
+guarantees.
+
+## Atomicity (#atomicity)
+
+The "A" in ACID. It dictates that within a given database
+transaction, the changes to be committed will be all or
+nothing. If the transaction fails partway through, the
+initial database state is left unchanged.
+
+Software is buggy by nature and introducing problems that
+unintentionally fail some operations is inevitable. Any
+sufficiently large program is eventually going to want to
+have an operation that writes two or more objects
+consecutively, and by wrapping that operation in a
+transaction, we get to ensure that even in these worst
+case scenarios state is left undamaged. Every subsequent
+operation will start with safe initial state.
+
+It's never desirable to fail transactions that we hoped to
+commit, but atomicity cancels the expensive fallout.
+
+!fig src="/assets/acid/transactions-in-requests.svg" caption="Some requests. Each wraps its database operations using an atomic transaction so that they either all commit, or none of them do."
+
+### The janitorial team (#janitorial-team)
+
+Many products will claim "document-level atomicity" (e.g.
+MongoDB, RethinkDB, CouchBase, ...) which means that
+writing any one row is atomic, but nothing beyond that.
+What happens in a world like this where any failed
+operation leaves invalid state behind?
+
+The default will be that a subsequent retry won't be able
+to reconcile the broken state, and that the data will need
+to be repaired before it's usable again.
+
+Here's an example of a simple GitHub-like service. When a
+user opens a pull request, we have a number of objects that
+we have to save in succession before finishing the request:
+a pull request modeling the created resource, a webhook to
+fire off to any listeners on the repository, a reviewer
+record mapping to whomever we've assigned review, and an
+event to store in the audit log.
+
+!fig src="/assets/acid/request-failure.svg" caption="Demonstration of how without an atomicity guarantee, a failed request results in an invalid state of data."
+
+A request that fails after the first two saves fails to
+create a valid set of objects, but without transactional
+atomicity can't revert the changes it did make. The result?
+An invalid pull request. A subsequent request that tries to
+look it up might error as the code tries to load state that
+was only partially created.
+
+You might hope that projects in this position would have
+automated protections in place to try and roll back bad
+partial transactions. While this may exist somewhere, it's
+much more likely that the overarching strategy is an
+optimistic sense of hope that these kinds of problems won't
+happen very often. Code paths begin to mutate to load data
+defensively so that they handle an exponential number of
+combinations of bad state that have accumulated in the data
+store over time.
+
+Bad incidents will necessitate heavy manual intervention by
+operators, or even a specially crafted "fixer script" to
+clean up state and get everything back to normal. After a
+certain size, this sort of thing will be happening
+frequently, and your engineers will start to spend less
+time as engineers, and more time as data janitors.
+
+!fig src="/assets/acid/pillars.jpg" caption="A grid of pillars at the Jewish Museum in Berlin. Real world consistency at its best."
+
+## Consistency (#consistency)
+
+The "C" in ACID. It dictates that every transaction will
+bring a database from one valid state to another valid
+state; there's no potential for anything in between.
+
+It might be a little hard to imagine what this can do for a
+real world app in practice, but consider one very
+common case where a user signs up for a service with their
+email address `foo@example.com`. We don't want two
+accounts with the same email in the system, so when
+creating the account we'd use an initial check:
+
+1. Look for any existing `foo@example.com` users in the
+ system. If there is one, reject the request.
+
+2. Create a new record for `foo@example.com`.
+
+Regardless of data store, this will generally work just
+fine until you have a system with enough traffic to start
+revealing edge cases. If we have two requests trying to
+register `foo@example.com` that are running almost
+concurrently, then the above check can fail us because both
+could have validated step one successfully before moving on
+to create a duplicated record.
+
+!fig src="/assets/acid/consistency.svg" caption="Without guaranteed consistency, there's nothing to stop the database from transitioning to an invalid state."
+
+You can solve this problem on an ACID database in multiple
+ways:
+
+1. You could use a strong isolation level like
+ `SERIALIZABLE` on your transactions, which would
+ guarantee that only one `foo@example.com` creation would
+ be allowed to commit.
+
+2. You could put a uniqueness check on the table itself (or
+ on an index) which would prevent a duplicate record from
+ being inserted.
+
+### Fix it later. Maybe. (#later-maybe)
+
+Without ACID, it's up to your application code to solve the
+problem. You could implement a locking system of sorts
+to guarantee that only one registration for any given email
+address can be in flight at once. Realistically, many
+providers on non-ACID databases will probably elect to just
+not solve the problem. Maybe later, _after_ it causes
+painful fallout in production.
+
+## Isolation (#isolation)
+
+The "I" in ACID. It ensures that two simultaneously
+executing transactions that are operating on the same
+information don't conflict with each other. Each one has
+access to a pristine view of the data (depending on
+isolation level) even if the other has modified it, and
+results are reconciled when the transactions are ready to
+commit. Modern RDBMSes have sophisticated multiversion
+concurrency control systems that make this possible in ways
+that are correct and efficient.
+
+
+
+
+
+
Isolation Level
+
Dirty Read
+
Nonrepeatable Read
+
Phantom Read
+
Serialization Anomaly
+
+
+
Read uncommitted
+
Allowed
+
Possible
+
Possible
+
Possible
+
+
+
Read committed
+
Not possible
+
Possible
+
Possible
+
Possible
+
+
+
Repeatable read
+
Not possible
+
Not possible
+
Allowed
+
Possible
+
+
+
Serializable
+
Not possible
+
Not possible
+
Not possible
+
Not possible
+
+
+
+ Transaction isolation levels and the
+ contention phenomena that they allow. See the Postgres
+ docs if you want to learn more.
+
+
+Concurrent resource access is a problem that every real
+world web application is going to have to deal with. So
+without isolation, how do you deal with the problem?
+
+### Slow, bad, and buggy custom locking schemes (#bad-locking)
+
+The most common technique is to implement your own
+pessimistic locking system that constrains access to some
+set of resources to a single operation, and forces others to
+block until it's finished. So for example, if our core
+model is a set of user accounts that own other resources,
+we'd lock the whole account when a modification request
+comes in, and only unlock it again after we've finished our
+work.
+
+!fig src="/assets/acid/pessimistic-locking.svg" caption="Demonstration of pessimistic locking showing 3 requests to the same resource. Each blocks the next in line."
+
+This approach is _all_ downsides:
+
+1. ***It's slow.*** Operations waiting on a lock may have
+ to wait for very extended periods for resources to
+ become available. The more concurrent access, the worse
+ it is (which probably means that your large users will
+ suffer the most).
+
+2. ***It's inefficient.*** Not every blocking operation
+ actually needs to wait on every other operation. Because
+ the models you lock on tend to be broad to reduce the
+ system's complexity, many operations will block when
+ they didn't necessarily have to.
+
+3. ***It's a lot of work.*** A basic locking system isn't
+ too hard to implement, but if you want to improve its
+ speed or efficiency then you quickly need to move to
+ something more elaborate which gets complicated fast.
+ With an ACID database, you'll get a fast, efficient, and
+ correct locking system built-in for free.
+
+4. ***It's probably not right.*** Locks and software are
+ hard. Implementing your own system _is_ going to yield
+ problems; it's just a question of what magnitude.
+
+## Durability (#durability)
+
+The "D" in ACID. It dictates that committed transactions
+_stay_ committed, even in the event of a crash or power
+loss. It's so important that even data stores that don't
+support the rest of ACI* tend to get it right. I wrote a
+separate article about [MongoDB's lengthy road to achieving
+durability][mongo-durability] for example.
+
+## Optimizing for saved seconds on a decade scale (#optimizing)
+
+An often cited feature of document data stores is that they
+allow you to bootstrap quickly and get to a prototype
+because they don't bog you down with schema design. Rich
+Hickey has a great talk where he makes [a distinction
+between "simple" and "easy"][simple-made-easy], with
+***simplicity*** meaning the opposite of complex, and
+***ease*** meaning "to be at hand" or "to be approachable"
+in that it may provide short term gratification, even if
+it's to the detriment of long term maintainability.
+Schemaless databases are not simple; they're easy.
+
+First of all, the claim around faster prototyping isn't
+actually true -- an experienced developer reasonably
+competent with their RDBMS of choice and armed with an ORM
+and migration framework can keep up with their document
+store-oriented counterpart (and almost certainly outpace
+them), but even if it were true, it's optimizing for
+exactly the wrong thing.
+
+While building a prototype quickly might be important for
+the first two weeks of a system's lifespan, the next ten
+years will be about keeping it running correctly by
+minimizing bugs and data consistency problems that will
+lead to user and operator pain and attrition. Life is
+artificially difficult when your `User` records aren't even
+guaranteed to come with an `id` or `email` field, and even
+steadfast schemaless enthusiasts will acquiesce to allow
+some form of constraints.
+
+By the time an organization hits hundreds of models and
+thousands of fields, they'll certainly be using some kind
+of object modeling framework in a desperate attempt to get
+a few assurances around data shape into place.
+Unfortunately by that point, things are probably already
+inconsistent enough that it'll make migrations difficult in
+perpetuity, and application code twisted and complicated as
+it's built to safely handle hundreds of accumulated edge
+cases.
+
+For services that run in production, the better defined the
+schema and the more self-consistent the data, the easier
+life is going to be. Valuing minuscule short-term gains
+over long-term sustainability is a pathological way of
+doing anything; when building production-grade software,
+it's a sin.
+
+## Not "webscale" (#scaling)
+
+A common criticism of ACID databases is that they don't
+scale, and by extension horizontally scalable (and usually
+non-ACID) data stores are the only valid choice.
+
+First of all, despite unbounded optimism for growth, the
+vast majority will be well-served by a single vertically
+scalable node; probably forever. By offloading infrequently
+needed "junk" to scalable alternate stores and archiving
+old data, it's reasonable to expect to vertically scale a
+service for a very long time, even if it has somewhere on
+the order of millions of users. Show me any database
+that's on the scale of TBs or larger, and I'll show you the
+100s of GBs that are in there when they don't need to be.
+
+After reaching scale on the order of Google's, there's an
+argument to be made for giving up aspects of ACID in return
+for certain kinds of partitioning tolerance and guaranteed
+availability, but advances in newer database technologies
+that support some ACID along with scaling mean that you
+don't have to go straight to building on top of a glorified
+key/value store anymore. For example, Citus gives you
+per-shard ACID guarantees. Google Spanner provides
+distributed locking read-write transactions for when you
+need them.
+
+!fig src="/assets/acid/foundation.jpg" caption="For best results, build your app on solid foundations."
+
+## Check your foundation (#foundation)
+
+There's a common theme to everything listed above:
+
+* You can get away ***without atomicity***, but you end up
+ hacking around it with cleanup scripts and lots of
+ expensive engineer-hours.
+
+* You can get away ***without consistency***, but only
+ through the use of elaborate application-level schemes.
+
+* You can get away ***without isolation***, but only by
+ building your own probably slow, probably inefficient,
+ and probably buggy locking scheme.
+
+* You can get away ***without constraints*** and schemas,
+ but only by internalizing a nihilistic understanding that
+ your production data isn't cohesive.
+
+By choosing a non-ACID data store, you end up
+reimplementing everything that it does for you in the user
+space of your application, except worse.
+
+Your database can and should act as a foundational
+substrate that offers your application profound leverage
+for fast and correct operation. Not only does it provide
+these excellent features, but it provides them in a way
+that's been battle-tested and empirically vetted by
+millions of hours of running some of the heaviest
+workloads in the world.
+
+My usual advice along these lines is that there's no reason
+not to start your projects with an RDBMS providing ACID and
+good features around constraints. In almost every case the
+right answer is probably to just use Postgres.
+
+[mongo-durability]: /fragments/mongo-durability
+[simple-made-easy]: https://www.infoq.com/presentations/Simple-Made-Easy
diff --git a/content/articles/alerting.md b/content/articles/alerting.md
new file mode 100644
index 000000000..48971570f
--- /dev/null
+++ b/content/articles/alerting.md
@@ -0,0 +1,259 @@
+---
+hook: A set of general guidelines to consider when designing alerts for a production
+ system.
+location: Leipzig (finished in San Francisco)
+published_at: 2015-08-18T11:28:48Z
+title: Designing Alerts
+---
+
+Adding alerts to systems has become a widespread standard practice that helps
+engineers keep their production systems up. The basic idea of such a setup is
+that systems should be designed in such a way that they should be able to
+compensate for common failure cases, but if something happens that's beyond the
+boundaries of the system's ability to handle, it should tell a human about it
+so that they can come in and fix the problem manually.
+
+The on-call model for software was adopted from other industries, the surgeon
+who may get called in to perform an emergency surgery for example. But unlike
+the surgeon and quite a few of our other pager-bearing counterparts, it's often
+possible to achieve a considerable amount of automation in software industry
+with the brunt of failures being handled by internal systems automatically and
+relatively few dropping through the cracks to a human. Receiving a page is of
+course a less-than-desirable outcome because it might lead to someone waking up
+in the middle of the night to fix a problem at work, making automation
+attractive (but not wholly sufficient in itself).
+
+Adopting a system of alerting certainly isn't a problem-free endeavor though;
+like many other areas in technology, there are a plethora of pitfalls to run
+into. Lack of appropriate discipline while designing them can lead to bad
+weeks on-call and operator fatigue. Specifically, here are a few problems that
+are easy to run into:
+
+* Alarms that page too aggressively; they're responded to and it turns out that
+ nothing is wrong.
+* Alarms that aren't specific enough; they're responded to and significantly
+ more analysis is needed to figure out what's going on.
+* Alarms that need to be passed down the line because they only represent an
+ indirect failure in the observed service.
+* Alarms that go off time and again, without the root problem ever getting
+ fixed.
+
+There's no universal playbook that'll ensure that these kinds of things never
+surface, but following a few general guidelines can help reduce the number of
+them significantly. Here's a few that I've assembled over the years while
+putting alerts into production for various services.
+
+## Guidelines (#guidelines)
+
+### Design for granularity (#granularity)
+
+There's nothing worse than waking up in the middle of the night and discovering
+that an alert has gone off that doesn't have an obvious remediation because it
+could mean that any number of things have gone wrong. This inevitably leads to
+a drawn out investigation that's further slowed by the operator being
+half-asleep.
+
+This one may seem obvious, but there are quite a few types of alerts that seem
+like a good idea until a closer inspection reveals that they're breaking the
+granularity rule. For example, an alert on something like a service's HTTP
+`/health` endpoint is a very widespread pattern, but one where a failure can
+ambiguously mean anything from a thread deadlock to its database going down. A
+much more powerful alternative pattern is to have a background process
+constantly logging fine-grain health information on a wide range of system
+telemetry, and using that to implement alarms that try to detect each type of
+failure condition individually. This will allow an operator to identify the
+root of the failure faster, and execute a quick resolution.
+
+A goal to shoot for here is to make sure that every alarm in your system has a
+1:1 ratio with a possible cause. If receiving an alert could mean that more
+than one thing has gone wrong, then there's probably room to make that alert
+more granular.
+
+### Alert at the root cause (#root-cause)
+
+Alerts should be designed to measure against the metric being emitted by the
+system which is the most directly relevant to them. This is another one that
+seems like obvious advice, but even an alert designed by an advanced user can
+often be found to have a number of unnecessary abstractions layered on top of
+it when scrutinized closely. A system operator's goal here should be to slice
+through these abstractions until only the most basic level is left.
+
+As an example, I previously wrote about how [long-lived transactions degraded
+the performance of our Postgres-based job queue](/postgres-queues). We'd
+originally been alerting based on the number of jobs in our background queue
+because that was the most obvious symptom of the problem. Upon closer study,
+we realized that the job queue only bloated because the time to lock a job was
+increasing, so that lock time became a more obvious candidate for an alert. But
+going in even further, we realized that the reason lock time degraded was
+almost always due to an old transaction somewhere in the system, so we started
+alerting on that. After even more time passed, we noticed lock degradation that
+was unrelated to oldest transaction, so we started alerting on the number of
+dead tuples in the table, which is directly correlated to lock time and an
+early warning for when the system starts degrading for any reason.
+
+### Minimize external services (#external-services)
+
+In all cases except maybe your most mission critical system, it's not worth
+waking up your operators when a third party service goes down that one of your
+components happens to depend on. Keep your alerts inward-facing so that if they
+trigger, there's always meaningful action that can be taken by your team rather
+than just passing that page on to someone else.
+
+By extension, wherever you have any measure of control (with other teams
+internally for example), try to encourage the operators of services that you
+depend on to maintain appropriate visibility into their own stacks. Your goal
+here is certainly to make sure that the system as a whole stays up, but that
+the team receiving the page are the ones with the best ability to influence the
+situation.
+
+A misstep that we made internally is that the component that handled [Heroku
+Dropbox Sync](https://devcenter.heroku.com/articles/dropbox-sync) ended up
+being built on top of a rickety component whose job it was to stream platform
+events and which had a very poor track record for reliability. It was
+ostensibly owned by my own team, and we only had bare bones alerting on it.
+Dutifully though, they put an alarm in place around an end-to-end integration
+test that injected a release into a Heroku app and waited for it to come out of
+the other end. When the audit streamer failed, they got paged, and they
+re-raised those pages to us, resulting in a bad situation for everyone
+involved.
+
+### Safe at rest (#safe-at-rest)
+
+One tempting mistake in a well-trafficked production environment is to build an
+alarm off of the ambient load in the system. For example, given a service
+designed to persist auditing events into a long-term archive we might alert on
+the fact that an event was persisted in the last five minutes. This often won't
+show a problem for a long time, but is undesirable because these kinds of
+alarms can trigger false positives in certain situations like a lull in traffic
+or a system-wide maintenance state, and also map poorly to development
+environments where there is no consistently reliable load.
+
+Whenever possible, design alerts that don't rely on any ongoing traffic at all,
+but if that can't be avoided, then make sure that there's a built-in
+multi-environment mechanism for stimulating it artificially.
+
+### Avoid hypotheticals (#avoid-hypotheticals)
+
+An overly enthusiastic engineer spinning up a new service might fall into
+the trap of guessing where all the alarms on it should be. Well-understood
+failure cases should be planned for and designed against, but some care should
+be taken to not roam too far out into the realms of the hypothetical. If in the
+future these alarms do end up going off, they'll more often than not take an
+operator by surprise, with the course to resolution unclear.
+
+Stay conservative when it comes to adding new alerts; it's okay to add alerts
+that are expected proactively, but for most others it might be better to wait
+until more concrete information is available. It's always possible to add new
+alerts when new problems occur or unexpected situations are observed.
+
+### Throttle on slowly (#throttle-slowly)
+
+Being on the wrong end of a pager after a new product goes into production
+might lead to a harrowing week. Luckily, no product goes into production
+overnight. Take advantage of the relatively long product lifecycle by putting
+in alerts during the alpha and beta phases that produce a notification that
+somebody will receive eventually (like an e-mail), but which won't page outside
+of business hours. Once those warning-style alerts are vetted and stable,
+promote them to production.
+
+### Don't allow flappy alarms to enter the shared consciousness (#flappy-alarms)
+
+As busy engineers, one bad habit that we're particularly susceptible to is
+applying the fastest possible fix to a problem and moving on without
+considering whether there may also be an only incrementally more expensive
+solution that could buy a much more permanent fix. In the case of an alarm,
+this often looks like responding to it and doing some basic investigation to
+make sure that nothing is seriously wrong, but without considering that the
+alarm may be very broken and badly in need of attention. Over time, it's easy
+to become desensitized to these types of flappy alarms to the point where they
+become built into the team's shared consciousness and where no one will
+consider them to any real depth.
+
+Newer employees might be especially susceptible to this problem because as far
+as they're concerned, some alert might have been going off for the entire
+length of their contemporary career. They'll also make implicit assumptions
+that their senior colleagues would have looked into the problem already if
+there was anything that could be done about it.
+
+My advice for these types of situations is (of course) to try to spend a bit of
+time trying to tweak or change the alarm so that it's less prone to produce
+false positives. _However,_ if nothing can easily be done to improve it, it's
+far better to eliminate the alarm completely than leave it in-place in its
+degraded state. Given a bad alarm, responders are already unlikely to be doing
+much of anything useful when it goes off, so it's better to save them the pain
+in the first place.
+
+An example of this that we had was to put an alert on 500 status codes coming
+from backend services when after we had an incident that involved a service
+going down that we would have been easily able to detect. The alert was added,
+but at a level that would trigger based on occasional ambient spikes in backend
+errors, which caused it to go off randomly every day or two. Every time it did,
+an operator would have to go in, find the service that was causing the trouble,
+and compare its current error levels to historical records before deciding how
+to proceed. It didn't take long before operators were ignoring these alarms
+completely, making them noisy and worthless.
+
+### Treat alarms as an evolving system (#evolve)
+
+As an extension to the previous point, it's a good idea to always think about
+your current set of alarms as an evolving system that you can and should be
+constantly tweaking and improving. Obviously this applies to adding new alarms
+as exotic new failure cases are discovered, but even if what you already have works
+pretty well, there may still be a more optimal configuration or a different
+alarm that could be put in place that does a better job compared to what's
+already in there.
+
+This also applies in the reverse: try to never get yourself in a position where
+you're cargo culting by keeping alarms around just because they've always been
+there. Even if you're not the original author of a particular component, take
+control of its stack and keep it sane.
+
+### Empower recipients to improve the situation (#empower-recipients)
+
+When I first started working at Heroku, we had a global pager rotation where
+for one day every few weeks, one on-call engineer would respond to any problem
+that occurred across the entire platform. For reasons that are hopefully mostly
+intuitive, this situation was utterly depraved; engineers would wake up,
+acknowledge pages, follow playbooks by rote, and hope against all odds that
+this would be the last page of the night. Everyone had strong incentive to fix
+the problems that were interrupting their sleep and lives half a dozen times a
+day, but for the most part these problems were in foreign codebases where the
+cost to patch them would be astronomically high.
+
+We eventually did away with this catastrophe by moving to team-based pager
+rotations and inventing the "ops week", which generally meant that the on-call
+engineer wasn't working on anything else besides being on-call. This would give
+them a bit of free capacity to go in and address the root problems of any pages
+that had woken them up, thus empowering them to reduce their own level of pain.
+
+### Observe ownership (#ownership)
+
+It may be tempting for an enthusiastic operator to put alarms into place that
+indeed do represent failure cases for a service that they own, but which upon
+closer inspection may have a root cause that lies outside of its area of
+responsibility. As mentioned in the ["Alert at the Root Cause"](#root-cause)
+above, it's important to make sure that the most direct source of a problem is
+traced, and in some cases that source may lie across team boundaries. A poorly
+placed alarm may result in an operator waking up only to pass the page onto
+another team who can address the problem more directly. Given this situation,
+it's far better to have that other team wake up first; hopefully they can
+address the problem before it bubbles up to everyone else.
+
+Once again, this one might seem obvious, but there are a number of situations
+where this problem is easy to encounter. For example, given a situation where
+an alarm belongs in the realm of a mostly non-technical team without a good
+operational situation, it might be easier to keep the alarm on your side rather
+than fire up the various bureaucratic mechanisms that would get them to analyze
+the situation and eventually take ownership. But even if easier in the short
+term, it's likely to cause trouble over time in that your team is unlikely to
+ever be able to address the underlying problem (see ["Empower Recipients to
+Improve the Situation"](#empower-recipients) above).
+
+## Summary (#summary)
+
+There's a common theme to all the guidelines mentioned above: most of them are
+intuitive at first sight, but can still represent dangers for even experienced
+teams. A successful process tries to use whatever guidelines are helpful to put
+together an initial set of alarms for a service, and then makes sure to iterate
+until that set has been optimized to maximize uptime and speed to resolution,
+all the while decreasing operator pain.
diff --git a/content/articles/antipatterns.md b/content/articles/antipatterns.md
new file mode 100644
index 000000000..8398695d3
--- /dev/null
+++ b/content/articles/antipatterns.md
@@ -0,0 +1,60 @@
+---
+hook: When the use of an anti-pattern is considered beneficial.
+location: San Francisco
+published_at: 2014-02-06T15:31:37Z
+title: Healthy Anti-patterns
+---
+
+In [Tracing Request IDs](/request-ids), I briefly talked about the possibility of making a request ID easily available from anywhere in an app through a pattern called the [_request store_](https://github.com/steveklabnik/request_store). It's a simple construct that stores data into Ruby's thread-local context:
+
+``` ruby
+# request store that keys a hash to the current thread
+module RequestStore
+ def self.store
+ Thread.current[:request_store] ||= {}
+ end
+end
+```
+
+Middleware is then used in an app which makes sure that all context that was added to the store is cleared between requests:
+
+``` ruby
+class Middleware::RequestStore
+ ...
+
+ def call(env)
+ ::RequestStore.store.clear
+ @app.call(env)
+ end
+end
+```
+
+I usually take it a bit further in a larger application, where it's been my habit to extend the original pattern so that we inventory exactly what it's supposed to contain, making it more difficult to accidentally create opaque dependencies by mixing data in randomly:
+
+``` ruby
+module RequestStore
+ def log_context ; store[:log_context] ; end
+ def request_id ; store[:request_id] ; end
+
+ def log_context=(val) ; store[:log_context] = val ; end
+  def request_id=(val) ; store[:request_id] = val ; end
+
+ private
+
+ def self.store
+ Thread.current[:request_store] ||= {}
+ end
+end
+```
+
+## The anti-pattern (#the-antipattern)
+
+Much like the infamous [singleton pattern](http://en.wikipedia.org/wiki/Singleton_pattern), the request store introduces global state into its application, which in turn makes it more difficult to reason about the dependencies of any given piece of code. Global state can have other side effects too, like making testing more difficult; globals that initialize themselves implicitly can be hard to set without a great stubbing framework, and will keep their value across multiple test cases, which is surprising behavior for anyone not expecting it.
+
+This sort of technique is slightly less controversial in the world of dynamic languages (where you often have something like a [GIL](http://en.wikipedia.org/wiki/Global_Interpreter_Lock) to save you from race conditions across threads), but I think it's safe to say that my highly pattern-oriented colleagues back in the enterprise world would have chastised me for considering the use of global state of any kind. Instead, they'd strongly prefer the use of a dependency injection framework to make certain information accessible from anywhere in an app.
+
+Despite all this, from an engineering perspective the side effects of using the request store over time have been minimal. By staying vigilant in making sure that it doesn't creep beyond its originally intended use, the request store becomes a very convenient way to store a few pieces of global state that would otherwise be very awkward to access. We keep its use in check by coming to consensus on what can get added to it through discussion in pull requests.
+
+The request store isn't an isolated case either. Projects like Rails and Sinatra have been using global patterns in places like [managing their database connection](https://github.com/rails/rails/blob/4-0-stable/activerecord/lib/active_record/core.rb#L86-L88) or [delegating DSL methods from the main module](https://github.com/sinatra/sinatra/blob/184fe58ca5879d04fce82fcb190c10f72e1f63bc/lib/sinatra/base.rb#L1988) for as long as they've existed. These uses may have caused some grief for somebody over the years, but lasting as long as they have is a testament to their success at least on a practical level.
+
+As long as anti-patterns can continue to show positive productivity results and to cause minimal harm, I'll keep using them.
diff --git a/content/articles/api-paradigms.md b/content/articles/api-paradigms.md
new file mode 100644
index 000000000..7be31f033
--- /dev/null
+++ b/content/articles/api-paradigms.md
@@ -0,0 +1,288 @@
+---
+title: Is GraphQL the Next Frontier for Web APIs?
+published_at: 2017-03-29T21:00:36Z
+location: San Francisco
+hook: Musings on the next API technology, and whether REST-ish
+ JSON over HTTP is just "good enough" to never be displaced
+ in a significant way.
+hn_link: https://news.ycombinator.com/item?id=14003134
+---
+
+For a long time the API community spent a lot of effort
+evangelizing [hypermedia][hypermedia], which promised to
+make web APIs more discoverable through clients that could
+follow links like a human, and more future proof by
+re-using existing hypertext and HTTP semantics like `<a>`
+links and 301 redirects.
+
+But hypermedia had a very hard time building any kind of
+traction. At least some of the trouble was technical, but
+its biggest problem wasn't that it wasn't useful, but that
+it wasn't useful _enough_. It may have had some nominal
+advantages over the REST-ish JSON APIs that are seen most
+often today, but they weren't valuable enough to justify
+the extra overhead.
+
+After five years of strong evangelism at every API
+conference in the world and very little actual adoption,
+it's a pretty safe bet that hypermedia isn't going to be
+the next big thing. But that leads us to the question of
+what will be? Does the world even need a new API paradigm?
+REST-ish JSON over HTTP has proven itself to be quite
+practical and resilient to obsolescence; it might just be
+"good enough" to reign supreme for a long time to come.
+
+## DX is paramount (#dx)
+
+As a service provider, it might be tempting to think that
+your choice of backend technology is going to make a big
+difference to your users, but that isn't necessarily true. As
+long as you meet a minimum bar of non-offensiveness, this
+is almost never the case (SOAP and OAuth 1 being two
+examples of technologies that don't).
+
+If there are reasonable tools available to keep the
+developer experience (DX) in integrating pretty good, users
+tend to be flexible. Avoid anything that's too heavy,
+exotic, or obnoxious, and you'll find that your users will
+care about the quality of your documentation far more than
+they do about the technology you use.
+
+## REST is okay (#rest)
+
+Roy Fielding's original ideas around REST are elegant and
+now quite widespread, but it's worth considering that the
+paradigm's actual advantages in APIs for developers are
+fairly unremarkable. Practically speaking, REST's strongest
+points are its widespread interoperability (every language
+has an HTTP client) and its **conventions** [1]. URLs are
+usually resources. Resources often have standard CRUD
+operations that are mapped to HTTP verbs like `PATCH` and
+`DELETE`. Status codes usually convey information. Commonly
+needed API mechanics like authentication and encoding are
+sometimes integrated into standard HTTP headers like
+`Authorization` and `Content-Encoding`. This is all very
+good; convention allows a developer to learn something once
+and then re-use that information to figure out how other
+things will probably work.
+
+
Open source; HA through Amazon RDS, Heroku
+ Postgres, or Azure Database
+
+
+
+ Feature matrix and notes of cloud
+ databases.
+
+
+Here's the meaning of each column:
+
+* ***Concurrent ACID:*** Whether the database supports ACID
+ (atomicity, consistency, isolation, and durability)
+ guarantees across multiple operations. ACID is a
+ [powerful tool for system correctness](/acid), and until
+  recently has been a long-sought but elusive chimera for
+ distributed databases. I use the term "concurrent ACID"
+ because technically Cosmos guarantees ACID, but only
+ within the confines of a single operation.
+
+* ***HA:*** Whether the database is highly available (HA).
+ I've marked every one on the list as HA, but some are
+ "more HA" than others with CockroachDB, Cosmos, and
+ Spanner leading the way in this respect. The others tend
+  to rely on single-node failovers.
+
+* ***Horizontally Scalable:*** Whether the database can be
+ scaled horizontally out to additional nodes. Everything
+ on the list except Postgres is, but I've included the
+ column to call out the fact that unlike the others,
+ Aurora's scalability is disk only. That's not to say that
+ it's unsuitable for use, but it has some caveats (see
+ ["Amazon Aurora"](#aurora) below for details).
+
+* ***Automatic Scalability:*** Distinguishes databases
+ where data partitioning and balancing is handled manually
+ by the user versus automatically by the database. As an
+ example of a "manual" database, in Citus or MongoDB you
+ explicitly tell the database that you want a table to be
+ distributed and tell it what key should be used for
+ sharding (e.g. `user_id`). For comparison, Spanner
+ automatically figures out how to distribute any data
+ stored to it to the nodes it has available, and
+ rebalances as necessary. Both options are workable, but
+ manual distribution has more operational overhead and
+ without care, can lead to unbalanced sharding where
+ larger nodes run disproportionately hot.
+
+* ***Low latency:*** The extra inter-node coordination
+  overhead used by CockroachDB, Cosmos, and Spanner to
+ ensure consistency comes at the cost of being unsuitable
+ where very low latency operations are needed (~1 ms). I
+ cover this in a little more detail below in ["Time-based
+ consistency"](#time-consistency).
+
+## Additional considerations (#additional-considerations)
+
+### CAP (#cap)
+
+The CAP theorem dictates that given _consistency_, _100%
+availability_, and _partition tolerance_, any given
+database can satisfy a maximum of two of the three.
+
+To explain why I didn't include CAP in the table above,
+I'll quote [Eric Brewer (Google VP Infrastructure) writing
+about Spanner][spanner-truetime]:
+
+> Despite being a global distributed system, Spanner claims
+> to be consistent and highly available, which implies
+> there are no partitions and thus many are skeptical. Does
+> this mean that Spanner is a CA system as defined by CAP?
+> The short answer is “no” technically, but “yes” in effect
+> and its users can and do assume CA.
+>
+> The purist answer is “no” because partitions can happen
+> and in fact have happened at Google, and during (some)
+> partitions, Spanner chooses C and forfeits A. It is
+> technically a CP system. We explore the impact of
+> partitions below.
+>
+> Given that Spanner always provides consistency, the real
+> question for a claim of CA is whether or not Spanner’s
+> serious users assume its availability. If its actual
+> availability is so high that users can ignore outages,
+> then Spanner can justify an “effectively CA” claim. This
+> does not imply 100% availability (and Spanner does not
+> and will not provide it), but rather something like 5 or
+> more “9s” (1 failure in 10^5 or less). In turn, the real
+> litmus test is whether or not users (that want their own
+> service to be highly available) write the code to handle
+> outage exceptions: if they haven’t written that code,
+> then they are assuming high availability. Based on a
+> large number of internal users of Spanner, we know that
+> they assume Spanner is highly available.
+
+In other words, modern techniques can achieve CP while
+still keeping availability that's incredibly good. Like
+_five or more 9s of good_. This result is so optimal that
+modern databases seem to be converging on it. Every
+database on the list above is _CP_ with varying levels of
+_A_ (with some caveats [1]).
+
+### Time-based consistency (#time-consistency)
+
+Sophisticated distributed systems like Spanner and
+CockroachDB tend to need a little more time to coordinate
+and verify the accuracy of the results that will be
+returned from any given node, and this makes them less
+suitable for low latency operations.
+
+Quizlet suggests that the minimum latency for a Spanner
+operation [is ~5 ms][spanner-quizlet]. The [Spanner
+paper][spanner-paper] describes the details of the
+coordination for various operations in sections 4.1. and
+4.2. CockroachDB states very explicitly in their FAQ that
+[it's not as good of a choice where low latency reads and
+writes are critical][cockroach-not-good-choice].
+
+The design of Microsoft's Cosmos isn't as transparent, but
+its documentation seems to suggest similar performance
+characteristics with [the median time for reads and writes
+at 5 ms][cosmos-99th].
+
+## The contenders (#contenders)
+
+### Amazon Aurora (#aurora)
+
+[Aurora][aurora] is a managed relational database that has
+an SQL interface that's compatible with MySQL and Postgres.
+One of its biggest selling points is performance, and it
+claims to provide 5x the throughput of MySQL and 2x of
+Postgres running on the same hardware.
+
+Aurora is quite distinctive from any other option on this
+list because it's not horizontally scalable at the node
+level, and its clusters more resemble those of a
+traditional RDBMS with a primary and read replicas. Instead,
+Amazon has devised a storage-level scaling scheme that
+allows its tables to grow to sizes significantly larger
+than you'd see with a traditional RDBMS; up to 64 TB per
+table.
+
+This storage-based scaling has the disadvantage that
+compute and memory resources (for writes or consistent
+reads) are limited to a single vertically scaled node
+[2], but it also has significant advantages: data
+is always colocated so query latency is very low. It also
+means that you can't make a mistake choosing a partition
+scheme and end up with a few hot shards that need to be
+rebalanced (which is _very_ easy to do and _very_ hard to
+fix). It may be a more appropriate choice than solutions
+like CockroachDB or Spanner for users looking for extensive
+scalability, but who don't need it to be infinite.
+
+### Citus (#citusdb)
+
+[Citus][citus] is a distributed database built on top of
+Postgres that allows individual tables to be sharded and
+distributed across any number of nodes. It provides clever
+concepts like _reference tables_ to help ensure data
+locality to improve query performance. ACID guarantees are
+scoped to particular nodes, which is often adequate given
+that partitioning is designed so that data is colocated.
+
+Most notably, Citus is open source and runs using the
+Postgres extension API. This reduces the risk of lock in,
+which is a considerable downside of most of the other
+options on this list. Compared to Aurora, it also means
+that you're more likely to see features from new Postgres
+releases make it into your database.
+
+A downside compared to CockroachDB and Spanner is that its
+data is sharded manually, which, as noted above, can lead to
+balancing problems. Another consideration is that it's
+built by a fairly new company with a yet unproven business
+model. Generally when selecting a database, it's good for
+peace of mind to know that you're using something that's
+almost certainly going to be around and well-maintained in
+ten years' time. You can be pretty confident of that when
+the product is made by a behemoth like Amazon, Google, or
+Microsoft, but less so for smaller companies.
+
+### CockroachDB (#cockroachdb)
+
+[CockroachDB][cockroach] is a product built out of
+Cockroach Labs, a company founded by ex-Googlers who are
+known to have been influential in building Google File
+System and Google Reader. It's based on the design laid out
+by the original Spanner paper, and like Spanner, uses a
+time-based mechanic to achieve consistency, but without the
+benefit of Google's GPS and atomic clocks.
+
+It provides serializable distributed transactions, foreign
+keys, and secondary indexes. It's open source and written
+in Go which gives it the nice property of being easily
+installable and runnable in a development environment.
+Their documentation is refreshingly well-written, easily
+readable, and honest. Take for example their [list of known
+limitations][cockroach-limitations].
+
+Like Spanner, the additional overhead of guaranteeing
+distributed consistency means that it's a poor choice where
+low latency operations are needed ([they admit as much
+themselves][cockroach-not-good-choice]). Like Citus above,
+the fact that it's built by a small company with an
+unproven business model is a downside.
+
+### Microsoft Cosmos (#cosmos)
+
+[Cosmos][cosmos] is Microsoft's brand-new cloud database.
+Its sales pitch tends to come on a little strong. For
+example, here's an excerpt where they sell schemaless
+design, which put most generously is a well known trade
+off, and less so, an anti-feature:
+
+> Both relational and NoSQL databases force you to deal
+> with schema & index management, versioning and migration
+> [...] But don’t worry -- Cosmos DB makes this problem go
+> away!
+
+That said, it's got a pretty good set of features:
+
+* Fast and easy geographical distribution.
+* A configurable consistency model that allows anything
+ from strong serializability all the way down to eventual
+ consistency which trades the possibility of out-of-order
+ reads for speed.
+* SLAs on operation timing that guarantees reads under 10
+ ms and indexed writes under 15 ms at the 99th percentile.
+
+Like with CockroachDB and Spanner, the distribution of
+Cosmos makes it less suitable for work requiring very low
+latency operations. Their documentation suggests a median
+read and write latency of ~5 ms.
+
+Cosmos is able to provide ACID through [the use of stored
+procedures in JavaScript][cosmos-acid]. This seems to be by
+virtue of having only one JavaScript runtime running on the
+primary so that only one script is being handled at a time,
+but it's also doing some bookkeeping to ensure that any
+writes can be reverted, thereby ensuring true atomicity
+(unlike say `EVAL` in Redis). Still, this approach is not
+as sophisticated as the MVCC engines used by other
+databases on this list because it can't provide concurrent
+use.
+
+### MongoDB (#mongodb)
+
+MongoDB is a NoSQL data store that stores data as
+schemaless JSON documents. It doesn't support ACID
+transactions, and if that wasn't enough, since its release
+in 2009 has had a number of valid criticisms around core
+database competencies like
+[durability](/fragments/mongo-durability),
+[security][mongo-security], and
+[correctness][mongo-correctness].
+
+I've included it for purposes of comparison and because it
+still seems to have a lot of mindshare, but it's not at the
+same level of sophistication as other systems on this list.
+Most others have a strict superset of its functionality
+(albeit with trade offs in a few cases), but also support
+other critically important features like ACID guarantees.
+New projects shouldn't start on MongoDB, and old projects
+should be thinking about migrating off of it.
+
+### Postgres (#postgres)
+
+[Postgres][postgres] is the trusty workhorse of traditional
+RDMSes. HA isn't built in, but is available through
+offerings from Amazon RDS, Heroku Postgres, or Azure
+Database (and hopefully Google Cloud SQL soon).
+
+Even though it's not a perfect fit for the rest of this
+list, I've included it anyway because it's often still the
+best option for many use cases. Most organizations don't
+have data that's as big as they think it is, and by
+consciously restricting bloat, they can get away with a
+vertically scaled Postgres node. This will lead to a more
+operable stack, and more options in case it's ever
+necessary to migrate between clouds and providers. You can
+also easily run Postgres locally or in testing, which is
+very important for friction-free productivity.
+
+## Closing thoughts (#closing-thoughts)
+
+Opinion time: the best choice for most people will be to
+start with Postgres. It's a battle-tested database with a
+spectacular number of features and few limitations. It's
+open source and widely available so it can easily be run in
+development, CI, or migrated across every major cloud
+provider. Vertical scaling will go a long way for
+organizations [who curate their data and offload lower
+fidelity information to more scalable
+stores](/acid#scaling).
+
+After you're at the scale of AirBnB or Uber, something like
+Aurora should look interesting. It seems to have many of
+the advantages of Postgres, and yet still manages to
+maintain data locality and scalable storage (at the costs
+of loss of dev/production parity and vendor lock in).
+Organizations at this tier who run hot and need compute and
+memory resources that are scalable beyond a single node
+might benefit from something like Citus instead.
+
+After you're at the scale of Google, something closer to
+Spanner is probably the right answer. Although less
+suitable for low latency operations, its scalability
+appears to be practically limitless.
+
+The only databases on the list that I've seen running in
+production are MongoDB and Postgres, so take these
+recommendations with a grain of salt. There's almost
+certainly hidden caveats to any of them that will only be
+uncovered with a lot of hands on experience.
+
+[1] The _CAP_ properties of Cosmos and MongoDB are
+configurable as they can both be made to be eventually
+consistent.
+
+[2] Aurora nodes are currently scalable to 32 vCPUs and 244
+GB of memory. Although that is "only" one node, it's
+nothing to scoff at and should provide enough runway for
+the vast majority of use cases.
+
+[aurora]: https://aws.amazon.com/rds/aurora/
+[citus]: https://www.citusdata.com/
+[cockroach]: https://www.cockroachlabs.com/
+[cockroach-limitations]: https://www.cockroachlabs.com/docs/known-limitations.html
+[cockroach-not-good-choice]: https://www.cockroachlabs.com/docs/frequently-asked-questions.html#when-is-cockroachdb-not-a-good-choice
+[cosmos]: https://docs.microsoft.com/en-us/azure/cosmos-db/introduction
+[cosmos-acid]: https://docs.microsoft.com/en-us/azure/documentdb/documentdb-programming#database-program-transactions
+[cosmos-99th]: https://docs.microsoft.com/en-us/azure/cosmos-db/introduction#low-latency-guarantees-at-the-99th-percentile
+[mongo-correctness]: https://blog.meteor.com/mongodb-queries-dont-always-return-all-matching-documents-654b6594a827
+[mongo-security]: https://krebsonsecurity.com/2017/01/extortionists-wipe-thousands-of-databases-victims-who-pay-up-get-stiffed/
+[postgres]: https://www.postgresql.org/
+[spanner-paper]: https://research.google.com/archive/spanner.html
+[spanner-quizlet]: https://quizlet.com/blog/quizlet-cloud-spanner
+[spanner-truetime]: https://research.google.com/pubs/pub45855.html
diff --git a/content/articles/dotfile-gpg.md b/content/articles/dotfile-gpg.md
new file mode 100644
index 000000000..9a10864d9
--- /dev/null
+++ b/content/articles/dotfile-gpg.md
@@ -0,0 +1,86 @@
+---
+hook: Learn how to start encrypting dotfile secrets with GPG, and some techniques
+ for getting those encrypted files integrated with your toolchain.
+location: San Francisco
+published_at: 2014-11-10T23:46:34Z
+title: Dotfile Secrets and GPG
+---
+
+A recent [scare installing Yosemite](/fragments/yosemite-progress) was a nice reminder to check the state of affairs of my backups. I do a decent job for most of my important data, but one weak point was my dotfiles, which I'd traditionally spread around as little as possible because the standard usage of so many programs had led me to storing a lot of plain text passwords and keys spread out all over the place.
+
+This put me on the path toward the complete eradication of any secrets on disk in my dotfiles with the eventual goal of being able to back them up to the cloud, and feel comfortable about doing so. As it turns out, this is a more difficult prospect than you might imagine: although we have a pretty obvious tool to help with this in [the form of GnuPG](https://wiki.archlinux.org/index.php/GnuPG), integration across the system is spotty and varies widely in terms of design and setup difficulty. For better or for worse most Unix programs are best adapted for reading secrets out of plain text files, and as part of getting a complete GPG setup you'll have to fight this natural order of things.
+
+In terms of basic GPG setup, I won't even try to one up [Thoughtbot's excellent article on the subject](http://robots.thoughtbot.com/pgp-and-you) which provides a comprehensive walkthrough on getting off the ground with GPG, including generating your key. As a prerequisite to doing anything in this article, you should give it a thorough read. After that's done, if you're still ready to use GPG to secure your secrets, you can turn to this article to help assemble a gpg-agent configuration and start integrating the suite with the rest of your tooling.
+
+## gpg-agent (#gpg-agent)
+
+gpg-agent's job is to remember your passphrase for some amount of time so that you don't have to compromise between encrypted secrets and your sanity. This would otherwise be a problem, as you'd be forced to enter a passphrase three times a minute.
+
+gpg-agent is a little more sophisticated than its cousin ssh-agent in that it can write out metadata about its process to a file, but like ssh-agent it still needs a little extra infrastructure built into your *rc files. The [Archlinux wiki page](https://wiki.archlinux.org/index.php/GnuPG#gpg-agent) on the subject contains a nice little bootstrap script that will launch a gpg-agent if necessary, but will otherwise export the settings of the copy that's already running:
+
+``` sh
+#!/bin/sh
+
+# start-gpg-agent
+
+gnupginf="$HOME/.gpg-agent-info"
+gnupglog="$HOME/.gpg-agent.log"
+
+if pgrep -x -u "$USER" gpg-agent >/dev/null 2>&1; then
+ eval `cat $gnupginf`
+ eval `cut -d= -f1 $gnupginf | xargs echo export`
+else
+ eval `gpg-agent -s --daemon --write-env-file "$gnupginf" \
+ --log-file "$gnupglog"`
+fi
+```
+
+This should then be included in your appropriate *rc file so that `gpg` can always find a running agent (note the leading `.` which is used to source the script):
+
+```
+. start-gpg-agent
+```
+
+Now when opening a new shell, you should be able to see that your agent is alive and well:
+
+```
+$ gpg-agent
+gpg-agent: gpg-agent running and available
+```
+
+When you run a GPG command, it will initially ask for your passphrase, but will then cache it for the next time around:
+
+```
+$ echo "encrypt me" | gpg --armor --encrypt -
+-----BEGIN PGP MESSAGE-----
+Version: GnuPG v1.4.13 (Darwin)
+
+hQEMA1XJl0SO//WLAQf/QsLhIqOSgfKtA3EwiIw290aNhpa1gl6rLXXPw3N66zuH
+...
+
+#
+# CACHED! A passphrase is no longer necessary ...
+#
+
+$ echo "encrypt me" | gpg --armor --encrypt -
+-----BEGIN PGP MESSAGE-----
+Version: GnuPG v1.4.13 (Darwin)
+
+hQEMA1XJl0SO//WLAQf/eBsnpMMoTZIBYEboXmdZcs73EaKD/HDcglQM9k7wyvt3
+...
+```
+
+With the right configuration, gpg-agent can also double as ssh-agent by duplicating its public interface, allowing it to perform the jobs of both agents. I personally don't use this option because gpg-agent somewhat invasively insists on managing your keys itself by assigning them its own passphrases, but it might be a feature that's worth checking out.
+
+## Ecosystem (#ecosystem)
+
+Now that GPG is up and running, the next challenge is to get it integrated into your toolchain so that you can purge any secrets that are stored in plain text. This is where things start to get a little more challenging because there really isn't a standardized methodology for getting programs to cooperate with GPG, and as a result, many programs have been written in ways that don't allow it to be plugged in easily. That said, thanks to the strong conventions of the Unix environment, it's amazing how many programs _can_ be backpatched for GPG support using nothing but simple shell primitives.
+
+The most trivial example is probably Curl, which is used to reading from an unencrypted .netrc file, but with a simple stdin pipe trick it [can be made to read from an encrypted .netrc.gpg](/fragments/gpg-curl) instead. Other programs like the Heroku toolbelt integrate with GPG out of the box (in this case by preferring a .netrc.gpg over a .netrc if one is available), but this is rare.
+
+I've converted quite a number of my standard tools now, and will curate the following list of mini-guides for GPG integration as I write them:
+
+* [GPG and Curl](/fragments/gpg-curl)
+* [GPG and the Heroku CLI](/fragments/gpg-heroku)
+* [GPG and HTTP Git](/fragments/gpg-git)
+* [GPG and s3cmd](/fragments/gpg-s3cmd)
diff --git a/content/articles/elegant-apis.md b/content/articles/elegant-apis.md
new file mode 100644
index 000000000..bbaaca444
--- /dev/null
+++ b/content/articles/elegant-apis.md
@@ -0,0 +1,449 @@
+---
+hook: A quick tour through some of the fundamentals of JSON Schema and Hyper-schema,
+ and how they can be applied to building APIs.
+location: Berlin
+published_at: 2014-05-27T13:39:13Z
+title: Elegant APIs with JSON Schema
+---
+
+We've recently [gone on record](https://blog.heroku.com/archives/2014/1/8/json_schema_for_heroku_platform_api) indicating our commitment to using JSON Schema as the format for describing our API's, then even further by [releasing a set of tools](https://blog.heroku.com/archives/2014/5/20/heroku-http-api-toolchain) to improve the process of building and working with schema-based HTTP API's. With the recent rise of great API description formats over the last few years like Swagger, Blueprint, and RAML (among others), I wanted to write a few words on what JSON Schema is, why it's a neat technology, and how it can be applied specifically to building great APIs.
+
+At any time, you can jump into more documentation [over at jsonschema.org](http://json-schema.org/documentation.html), which includes detailed draft specifications for both JSON Schema and JSON Hyper-schema.
+
+## The basic case (#basic-case)
+
+At its essence, JSON Schema is simply a declarative language for validating the format and structure of a JSON object. It allows you to specify a number of special primitives to describe exactly what a valid JSON object will look like, and provides a powerful nesting concept that allows you to extend these primitives to a document of any complexity. This idea hails back to the days of XML, when it was common to see XML documents linking to the [XSD's](http://en.wikipedia.org/wiki/XML_schema) (XML Schema Definition) that should be used to validate them.
+
+Let's start with one of the most basic schemas possible. The following describes a single value inside a JSON object:
+
+```
+{
+ "type": "string"
+}
+```
+
+The value `"foo"` would validate successfully while `123` or `false` would not.
+
+More complex rules can be mixed into the object as well. This will validate that the string matches a particular regex pattern:
+
+```
+{
+ "pattern": "^[a-z][a-z0-9-]{2,30}$",
+ "type": "string"
+}
+```
+
+## Nesting schemas (#nesting)
+
+While the above lets us validate a single value, it's more interesting to validate a complex JSON object. We can build on the above by nesting our single value validation into another schema using the `properties` keyword, which describes the keys that a JSON object might have, and the schema that validates their values:
+
+```
+{
+ "properties": {
+ "name": {
+ "pattern": "^[a-z][a-z0-9-]{2,30}$",
+ "type": "string"
+ }
+ },
+ "required": ["name"],
+ "type": "object"
+}
+```
+
+The `required` keyword indicates that the property `name` is expected, so while the object `{"name":"foo"}` is valid, `{}` is not.
+
+Note how the `type` keyword is present in both of the objects in our schema above. This is where the elegance of JSON Schema starts to emerge: **both objects are JSON Schemas that are defined to precisely the same specification**. We could give the `name` object its own `definitions`, but that would be non-sensical because it's defined as a `string` rather than an `object`.
+
+A very common convention in cases like this is to define subschemas under `definitions` and reference them from elsewhere, which allows those schema definitions to be re-used. Like `properties`, `definitions` also maps object keys to schemas, but doesn't suggest that those keys should actually be properties on an object being validated; it's simply a useful mechanism for defining schemas in a common place. The above could be re-written to use `definitions` like so:
+
+```
+{
+ "definitions": {
+ "name": {
+ "pattern": "^[a-z][a-z0-9-]{2,30}$",
+ "type": "string"
+ }
+ },
+ "properties": {
+ "name": {
+ "$ref": "#/definitions/name"
+ }
+ },
+ "required": ["name"],
+ "type": "object"
+}
+```
+
+The strange `$ref` keyword is a [JSON Reference](http://tools.ietf.org/html/draft-pbryan-zyp-json-ref-03). It tells schema parsers that the definition is not a schema itself, but rather references a schema elsewhere in the document (or in a different document). The `#` denotes the root of the JSON document, and the slashes are keys that should be descended through until the appropriate value is reached.
+
+## We need to go deeper (#multi-resource)
+
+Let's think of our schema above as the definition of a simple app, which has a name, but might later have some other properties as well. A very common scenario (especially in an API) might be to define another type of object as well, and to have these objects reference each other.
+
+Along with our app, let's define a domain:
+
+```
+{
+ "definitions": {
+ "name": {
+ "format": "hostname",
+ "type": "string"
+ }
+ },
+ "properties": {
+ "name": {
+ "$ref": "#/definitions/name"
+ }
+ },
+ "required": ["name"],
+ "type": "object"
+}
+```
+
+Domain looks a lot like an app, with its own `name` and property definitions. Note above that we've defined that domain's `name` is in the `hostname` format, which is a special string validation built into JSON Schema.
+
+Now, remember how I told you that schemas nest? They do, and we've already seen how they can be nested one level deep above. To make this even better though, we can actually nest them to _any_ level. Let's put app and domain into the same root schema which will eventually be used to define our entire API. Note how the references below change to reflect the greater depth of nesting.
+
+```
+{
+ "definitions": {
+ "app": {
+ "definitions": {
+ "domains": {
+ "items": {
+ "$ref": "#/definitions/domain"
+ },
+ "type": "array"
+ },
+ "name": {
+ "pattern": "^[a-z][a-z0-9-]{2,30}$",
+ "type": "string"
+ }
+ },
+ "properties": {
+ "domains": {
+ "$ref": "#/definitions/app/definitions/domains"
+ },
+ "name": {
+ "$ref": "#/definitions/app/definitions/name"
+ }
+ },
+ "required": ["name"],
+ "type": "object"
+ },
+ "domain": {
+ "definitions": {
+ "name": {
+ "format": "hostname",
+ "type": "string"
+ }
+ },
+ "properties": {
+ "name": {
+ "$ref": "#/definitions/domain/definitions/name"
+ }
+ },
+ "required": ["name"],
+ "type": "object"
+ }
+ },
+ "properties": {
+ "app": {
+ "$ref": "#/definitions/app"
+ },
+ "domain": {
+ "$ref": "#/definitions/domain"
+ }
+ },
+ "type": "object"
+}
+```
+
+Phew! We've managed to build out a pretty significant schema already. Astute readers may have noticed that along with the new domains resource, we've defined a new property for app:
+
+```
+"domains": {
+ "items": {
+    "$ref": "#/definitions/domain"
+ },
+ "type": "array"
+}
+```
+
+`items` is another special keyword that applies specifically to the `array` type. It indicates that all items in the array should conform to the referenced schema; in this case, that means that `domains` should be an array of objects that validate according to the `domain` schema. For example, this array validates correctly:
+
+```
+[
+ { "name": "example.com" },
+ { "name": "heroku.com" }
+]
+```
+
+We've now demonstrated not only how schemas can be nested to as many levels as we need, but also how subschemas can start to reference each other to build out more complex validation rules in a modular way.
+
+Once again, discerning readers may have noticed that our top-level schema actually defines a non-sensical object that has both an app and a domain like `{"app":..., "domain":...}`. This is true, but we'll see that it's not important as we move onto building an API in the next section.
+
+## Adding hyper-schema (#hyper-schema)
+
+Along with the JSON Schema, a companion draft also defines JSON Hyper-schema, which builds off the original specification to define a schema that can host a collection of links. This allows us to move beyond the realm of basic JSON validation, and into the more interesting area of using schema to build APIs.
+
+Let's define two simple links on our app schema for creating a new app (`POST /apps`) and listing existing ones (`GET /apps`):
+
+```
+{
+ "definitions": ...,
+ "links": [
+ {
+ "description": "Create a new app.",
+ "href": "/apps",
+ "method": "POST",
+ "rel": "create",
+ "title": "Create"
+ },
+ {
+ "description": "List apps.",
+ "href": "/apps",
+ "method": "GET",
+ "rel": "instances",
+ "title": "List"
+ }
+ ],
+ "properties": ...,
+ "required": ["name"],
+ "type": "object"
+}
+```
+
+Notice how these define individual HTTP endpoints: an access verb is specified in `method`, along with a URI in `href`. We've also tagged each link with some other metadata that tells us a little more about what it does and how we should describe it; this can be supremely useful for tasks like generating code and documentation.
+
+### Request schemas (#request-schemas)
+
+The links above are useful in that we now know a little bit about how to interact with an apps resource, but they don't tell us much beyond that. For example, how do we know what parameters to send in while creating an app?
+
+Luckily, hyper-schema also allows us to nest schemas to describe just that. Let's leverage references once again, and define a create app link that requires a valid app object to be sent in along with a request:
+
+```
+{
+ "description": "Create a new app.",
+ "href": "/apps",
+ "method": "POST",
+ "rel": "create",
+ "schema": {
+ "$ref": "#/definitions/app"
+ },
+ "title": "Create"
+}
+```
+
+Note that although the above is fine in this trivial case, we often want to define required request parameters to be a subset of what we might see in a fully valid object. Because we're defining a schema like any other, we can de-construct it and reference particular properties that we want to see in the incoming request:
+
+```
+{
+ "description": "Create a new app.",
+ "href": "/apps",
+ "method": "POST",
+ "rel": "create",
+ "schema": {
+ "properties": {
+ "name": {
+ "$ref": "#/definitions/app/definitions/name"
+ }
+ },
+ "required": ["name"],
+ "type": "object"
+ },
+ "title": "Create"
+}
+```
+
+A request to an API implementing this schema might look like the following:
+
+```
+curl -X POST http://example.com/apps \
+ -H "Content-Type: application/json" \
+ -d '{"name":"my-app"}'
+```
+
+We could also remove the name requirement (`"required": ["name"]`) if we wanted to generate a name for the new app unless the user explicitly overrides it. In that case, an empty JSON object `{}` would be a valid request for this endpoint.
+
+Once again, I'd like to draw your attention to the elegant modularity of JSON Schema here. We've defined a property on our app object (`name`) one time, then referenced it to describe what a valid app looks like, then used the same technique to reference it again to describe a valid request.
+
+A declarative definition of incoming requests can be supremely useful for sanitizing data and generating errors for malformed data automatically. A tool like [Committee](https://github.com/interagent/committee), which provides a collection of schema-related middleware, can help with this in Ruby.
+
+Note that the API I'm building above is a little like the Heroku API in that it expects input as `application/json` rather than the more commonly seen `application/x-www-form-urlencoded` (e.g. `name=my-app&foo=bar`). Hyper-schema doesn't necessarily stipulate that incoming requests have to be JSON, in fact it defines an `encType` that allows a link to specify its format, but the symmetry of a request and response that are both in JSON is a clean model worthy of consideration (in my humble opinion).
+
+### Response schemas (#response-schemas)
+
+Much like the incoming request, Hyper-schema allows us to specify a schema for the outgoing response as well with the `targetSchema` keyword. Within the confines of our simple example API above, this one is easy; given a request to create an app, let's respond with an app:
+
+```
+{
+ "description": "Create a new app.",
+ "href": "/apps",
+ "method": "POST",
+ "rel": "create",
+ "targetSchema": {
+ "$ref": "#/definitions/app"
+ },
+ "title": "Create"
+}
+```
+
+For the list endpoint, we'd like to describe the response as an array of apps:
+
+```
+{
+ "description": "List apps.",
+ "href": "/apps",
+ "method": "GET",
+ "rel": "instances",
+ "targetSchema": {
+ "items": {
+ "$ref": "#/definitions/app"
+ },
+ "type": "array"
+ },
+ "title": "List"
+}
+```
+
+And we've managed to re-use our basic object definitions yet again! Knowing what responses are supposed to look like can be very handy for testing for API regressions in acceptance-level tests. Once again, [Committee](https://github.com/interagent/committee) can help with that in Ruby by providing test helpers for use with rack-test.
+
+## Let's get meta (#meta-schemas)
+
+An interesting set of products that both JSON Schema and Hyper-schema provide are their own [meta-schemas](http://json-schema.org/documentation.html). Because a schema is itself just a JSON document, a schema can be written for a schema! For example, take a look at the [JSON Hyper-schema meta-schema](http://json-schema.org/hyper-schema). Note how the special `$schema` keyword points back to its own `id`. This schema can be used to validate the format of your own Hyper-schema with a tool like [`json_schema`](https://github.com/brandur/json_schema):
+
+```
+validate-schema --detect my-schema.json
+```
+
+As we all know, convention can be a very challenging problem, especially when working within a larger team of people who all have their own ideas of what a good API looks like. One possible solution to this problem is to start defining convention declaratively by writing a meta-schema that enforces a layer of constraints on top of what's already dictated by the schema and hyper-schema specifications themselves.
+
+For example, a hyper-schema only dictates that a link specifies the `href` and `rel` attributes. We could require that a few more keys are present as well:
+
+```
+{
+ "$schema": "http://example.com/my-hyper-schema",
+ "definitions": {
+ "resource": {
+ "properties": {
+ "links": {
+ "items": {
+ "$ref": "#/definitions/link"
+ },
+ "type": "array"
+ }
+ }
+ },
+ "link": {
+ "required": [ "href", "method", "rel", "targetSchema" ],
+ "type": "object"
+ }
+ },
+ "id": "http://example.com/my-hyper-schema#",
+ "title": "My JSON Hyper-Schema Variant",
+ "properties": {
+ "definitions": {
+ "additionalProperties": {
+ "$ref": "#/definitions/resource"
+ }
+ }
+ }
+}
+```
+
+It may be necessary to read some documentation to understand all the specific keywords in use here, but in essence what we're declaring here is that everything under `definitions` in our hyper-schema is an API resource (see `resource` under `definitions`), and that those resources may have links (`link` under `definitions`). Those links should have the properties `href`, `method`, `rel`, and `targetSchema`.
+
+Checking the validity of our schema above with `validate-schema` from the [json_schema](https://github.com/brandur/json_schema), we get this:
+
+```
+validate-schema -d -s meta.json schema.json
+schema.json is valid.
+```
+
+But if we leave `targetSchema` out of our first link, we get this instead:
+
+```
+validate-schema -d -s meta.json schema.json
+schema.json#/definitions/app/links/0: failed schema #/definitions/resource/properties/links/items: Missing required keys "targetSchema" in object; keys are "description, href, method, rel, schema, title".
+```
+
+We could also mandate that all resource property names should be lowercase only with underscores allowed:
+
+```
+"resource": {
+ "properties": {
+ ...,
+ "properties": {
+ "additionalProperties": false,
+ "patternProperties": {
+ "^[a-z][a-z_]+[a-z]$": {}
+ }
+ }
+ }
+},
+```
+
+Note that the `patternProperties` keyword allows us to match on a schema based on the name of a property in an object, and `additionalProperties` set to `false` dictates that properties that are not in the `properties` object or defined in `patternProperties` are not valid. Re-running again we see that all the property names we defined are okay:
+
+```
+validate-schema -d -s meta.json schema.json
+schema.json is valid.
+```
+
+### Mixing in hyper-schema's meta-schema (#hyper-schema-meta)
+
+You may also notice that the [hyper-schema meta-schema](http://json-schema.org/hyper-schema) uses an `allOf` attribute to make sure that in addition to the constraints it defines, data should also validate against the JSON Schema meta-schema as well. We can do the same thing for our variant except for hyper-schema:
+
+```
+{
+ "$schema": "http://example.com/my-hyper-schema#",
+ "allOf": [
+ {
+ "$ref": "http://json-schema.org/draft-04/hyper-schema#"
+ }
+ ],
+ ...
+}
+```
+
+## Schema endpoint (#schema-endpoint)
+
+A convention that we have at Heroku is to serve the schema itself when a request is made to `GET /schema`. One neat trick is to define the `/schema` link in the schema itself and that its response should validate according to its meta-schema. Using the same mechanism that you'd use to check that a JSON response conforms to its schema, this allows the schema to validate itself against its own meta-schema from your acceptance test suite!
+
+```
+{
+ "href": "/schema",
+ "method": "GET",
+ "rel": "self",
+ "targetSchema": {
+    "$ref": "http://example.com/my-hyper-schema#"
+ }
+}
+```
+
+All the code for the simple hyper-schema and the meta-schema that we've built here are available [on GitHub](https://github.com/brandur/simple-schema).
+
+## Schemas for other media types (#other-media-types)
+
+A final point worth mentioning is that even if a Hyper-schema API isn't your thing, [Hyperschema.org has a set of schemas available](http://hyperschema.org/mediatypes/) for other media types, including today's popular hypermedia formats like Collection+JSON, HAL, and UBER.
+
+## Summary (#summary)
+
+To recap, we've used JSON Schema to define the following:
+
+* Individual API resources (app and domain).
+* An API "super schema" that contains all resources in a single document.
+* Hyper-schema links that describe actions on those resources.
+* Schemas that validate incoming requests on each link.
+* Schemas that describe the JSON returned by each link.
+* A meta-schema that validates the conventions of our API's schema.
+
+Although the API itself still needs to be implemented, by combining this schema with the various packages from the HTTP toolchain, we get some nice features for free:
+
+* Generate API documentation with [Prmd](https://github.com/interagent/prmd).
+* Generate a Ruby client with [Heroics](https://github.com/interagent/heroics).
+* Generate a Go client with [Schematic](https://github.com/interagent/schematic), like the one used in Heroku's new CLI, [hk](https://github.com/heroku/hk).
+* Boot a working stub with [Committee](https://github.com/interagent/committee) that will validate incoming requests.
+* Insert a request validation middleware with [Committee](https://github.com/interagent/committee) that will validate incoming request data according to schema before it reaches our stack.
+* Use [Committee's](https://github.com/interagent/committee) test helpers to verify that the responses from our stack conform to the schema.
diff --git a/content/articles/exit-status.md b/content/articles/exit-status.md
new file mode 100644
index 000000000..26b2a35d7
--- /dev/null
+++ b/content/articles/exit-status.md
@@ -0,0 +1,66 @@
+---
+hook: An exercise of discovery around how to extend the shell's API.
+location: San Francisco
+published_at: 2014-09-28T00:50:23Z
+title: Command Exit Status
+---
+
+During a [recent discussion on two factor authentication](https://github.com/heroku/hk/issues/171), the topic of command exit statuses came up. For the shell-uninitiated, an exit status is an integer between 0 and 255 returned when a program exits, usually readable by running `echo $?`. This is in effect one of the key pieces of the API which shells use to communicate with the programs that they run.
+
+First though, a little background: when building out hk, a strong philosophy was adopted around most commands being non-interactive by default (with a few well-known exceptions like `hk login`). This is a nice characteristic when attempting to compose hk directives into something like a shell script; at no point will a command unexpectedly prompt for user input and possibly ruin automation.
+
+With the addition of 2FA to the Heroku API, there is a new possibility of 2FA being arbitrarily required for an API call in that the behavior may vary based on particular endpoints, but also based on the parameters of the request. For example, a particularly sensitive app may require that a two factor code is specified for most of its critical operations. The current CLI handles this by simply prompting the user as needed, but hk's principle of non-interactivity makes it less obvious how to support this.
+
+A decision was made to error when a two factor challenge was detected, but with a well-known exit status that would signal to other programs that the command failed due to a 2FA problem. A smart script would then be able to back-off and perform an appropriate action; say to e-mail its operator to indicate that a new two factor code was needed.
+
+This led to the question of which exit status to return. It's fairly common knowledge that in Bash-like shells, status 0 signals success and that status 1 is an error. A misuse of the program can either be signaled by 1 (as demonstrated many programs including `git` or `ls`), or possibly a 2, which signals the misuse of a shell built-in (hk uses 2 to differentiate this class of errors from other types of failures signaled by 1). When a program receives a fatal signal, it will exit with a code of 128 + `n` where `n` is the signal code. For example, for a program sent signal 2 (`SIGINT`, or more commonly thought of as `Ctrl+C`):
+
+```
+$ curl -n https://api.heroku.com/apps
+^C
+
+$ echo $?
+130
+```
+
+The Advanced Bash-Scripting Guide lists a number of other [reserved exit codes](http://tldp.org/LDP/abs/html/exitcodes.html). Some attempt at standardization has also been made in the kernel header `sysexits.h`:
+
+``` c
+#define EX_OK 0 /* successful termination */
+
+#define EX__BASE 64 /* base value for error messages */
+
+#define EX_USAGE 64 /* command line usage error */
+#define EX_DATAERR 65 /* data format error */
+#define EX_NOINPUT 66 /* cannot open input */
+#define EX_NOUSER 67 /* addressee unknown */
+#define EX_NOHOST 68 /* host name unknown */
+#define EX_UNAVAILABLE 69 /* service unavailable */
+#define EX_SOFTWARE 70 /* internal software error */
+#define EX_OSERR 71 /* system error (e.g., can't fork) */
+#define EX_OSFILE 72 /* critical OS file missing */
+#define EX_CANTCREAT 73 /* can't create (user) output file */
+#define EX_IOERR 74 /* input/output error */
+#define EX_TEMPFAIL 75 /* temp failure; user is invited to retry */
+#define EX_PROTOCOL 76 /* remote error in protocol */
+#define EX_NOPERM 77 /* permission denied */
+#define EX_CONFIG 78 /* configuration error */
+
+#define EX__MAX 78 /* maximum listed value */
+```
+
+So back to our original problem: what exit code should we choose to signal a 2FA challenge error? It turns out that the answer to this question is not perfectly clear, as no official methodology exists for choosing user-defined codes. Once again, the Advanced Bash-scripting Guide comes in with a helpful suggestion:
+
+> There has been an attempt to systematize exit status numbers (see /usr/include/sysexits.h), but this is intended for C and C++ programmers. A similar standard for scripting might be appropriate. The author of this document proposes restricting user-defined exit codes to the range 64 - 113 (in addition to 0, for success), to conform with the C/C++ standard. This would allot 50 valid codes, and make troubleshooting scripts more straightforward.
+
+This seems like as good of a system as anything! If we then skip the codes found in `sysexits.h`, we get a starting value for user codes of 79, which is the code that [we decided to start with in hk](https://github.com/heroku/hk/pull/173):
+
+```
+$ hk env -a paranoid
+error: A second authentication factor or pre-authorization is required
+for this request. Your account has either two-factor or a Yubikey
+registered. Authorize with `hk authorize`.
+
+$ echo $?
+79
+```
diff --git a/content/articles/free-certificates.md b/content/articles/free-certificates.md
new file mode 100644
index 000000000..9225e4583
--- /dev/null
+++ b/content/articles/free-certificates.md
@@ -0,0 +1,198 @@
+---
+hook: Getting a certificate that most browsers would accept used to be difficult and
+ expensive, but that's changing as we enter a relative golden age of web encryption.
+ Read this for options for getting certificates issued for free.
+location: Vancouver
+published_at: 2016-01-31T17:48:53Z
+title: A Guide to Free CA-Signed Certificates
+---
+
+Securing a website used to be an expensive process. Although certificates have
+been slowly getting cheaper, they're still on par with the cost of the domain
+name that they're protecting, and getting one issued was often complex and
+error prone. Furthermore, in a pre-[SNI][sni] world, HTTPS connections needed
+to be terminated at a unique IP address, making it prohibitively expensive for
+hosts to offer low cost encryption to their users.
+
+In an attempt to unwind some of the mistakes that were made around security in
+the earlier ages of the Internet, browsers are starting to prod service providers
+in the right direction. For example, in the near future [Chrome will start
+shaming websites that aren't encrypted][chromeshame] and [Firefox will start
+red flagging login forms that come in over HTTP][firefoxshame].
+
+The good news is that we're now living in a golden age of secure connections.
+The price of CA-signed certificates is trending toward zero, and if you're a
+savvy user who knows where to look, you can easily get one for free already
+today. Support for SNI is now widespread enough that hosts have a cheap
+mechanism for offering secure termination for all their users. Encryption may
+be especially critical for banks and Facebook, but it belongs on every site
+online: shopping sites (even pre-checkout), blogs, marketing landing pages,
+personal websites, and everything in between. Hopefully by reading this guide,
+you'll realize that there aren't any excuses for running an insecure website
+anymore, so come on, let's encrypt!
+
+## Services (#services)
+
+### CloudFlare (#cloudflare)
+
+**Website:** [www.cloudflare.com][cloudflare]
+
+Although CloudFlare is largely known for being a CDN, they've been more quietly
+offering a great certificate-issuing and TLS terminating service for some time
+now. It's easy to use, and is especially ideal for anyone who's hosting content
+on another service that already offers secure termination (like Heroku or
+GitHub pages), but who would like to have a custom domain name. You also get
+the added benefit of CloudFlare's CDN services, which can be had for free.
+
+The good:
+
+* Unbelievably easy. Especially if you're already hosting your DNS with them,
+ getting a secured endpoint involves as little as creating a new record,
+ specifying a target origin, and clicking the little cloud icon to turn it on.
+* Automatic rotation. You don't even have to know or understand what's
+  happening, but can rest assured that your users will have
+ continued access to your services without suddenly getting hit with a red
+ expired certificate page because one of your ops people forgot to get a new
+ one issued.
+
+The bad:
+
+* Certificates are local to CloudFlare, and you can't export them and bring
+  them with you. Using CloudFlare's CDN services is appropriate for just about
+  anyone though, so that's fine in a lot of different cases.
+* CloudFlare still defaults to "flexible mode" that allows a target origin
+ server to be serving content over HTTP even if CloudFlare itself is
+ terminating over HTTPS. This option is provided for user convenience (and it
+ should be noted that it's still better than no TLS), but allows unwary users
+ to unwittingly build themselves an unsafe setup.
+
+I should also note for the pundits that CloudFlare's magic works by SNI, and
+as such may not work for clients that are using absolutely ancient technology
+for browsing. As of today, ["Can I Use ...?" estimates support at 97+%
+globally][caniuse], so an SNI-based solution is probably appropriate for you as
+long as you're running an operation that's smaller than Google.
+
+### Let's encrypt (#lets-encrypt)
+
+**Website:** [letsencrypt.org][letsencrypt]
+
+Let's Encrypt is a free CA run by the ISRG (Internet Security Research Group)
+with the charter of providing free certificates in an open and transparent way
+to help secure the Internet. They've been making waves lately, and the turning
+point that we're seeing around the cost of CA-signed certificates on the
+Internet could reasonably be attributed to their work.
+
+The good:
+
+* Let's Encrypt is by far the most flexible of any of these solutions in that
+ they'll issue a certificate with private key and all. That means you can take
+ these certificates with you and use them with any other service of your
+ choice.
+* They've built out a great set of tools that allow you to easily get a
+ certificate safely installed for common web servers like Nginx or Apache.
+* They've been working hard on building a standardized protocol to verify
+ domain ownership and issue certificates called [ACME][acme] which will help
+ further commoditize Internet security, and curb user error during the issuing
+ process.
+* Let's Encrypt is a project built in collaboration with the Linux Foundation,
+  and they don't have any hidden agendas (or relative to any of these other
+  services at least). You can feel good about yourself for using the service.
+
+The bad:
+
+* No wildcard certificates for the foreseeable future. That said, a great API
+ that allows for easy automation goes a long way towards compensating for
+ this.
+
+### AWS certificate manager (#acm)
+
+**Website:** [aws.amazon.com/certificate-manager][acm]
+
+A brand new entrant is AWS Certificate Manager (ACM), which finally gives us
+the missing link for building secure services on Amazon. ACM is AWS-only, but
+is easy to use through either their API or web console, and plugs right into a
+CloudFront distribution or ELB (Elastic Load Balancer).
+
+The good:
+
+* ACM will issue wildcard certificates for free, a feature which isn't
+ currently available from any other provider. This isn't a profound difference
+ given that certificates are free anyway, but it saves you from having to
+ re-issue a certificate for every new domain you deploy. It's perfect if you
+ have a microservices-like setup hosted on AWS.
+* Like with CloudFlare, you get automatic certificate rotation. This kind of
+ of mind is worth paying for, but you'll get it for free.
+* ACM finally gives us a free (or at least low-cost) way of protecting
+ statically built websites served out of S3 buckets. Just create a bucket, a
+ certificate in ACM, and a CloudFront distribution, link them all together,
+ and you're done.
+
+The bad:
+
+* Like CloudFlare, certificates are local to Amazon and can't be exported. But
+  if you're on Amazon, there's a pretty good chance that you're _all_ on
+ Amazon, and this won't be a huge problem.
+
+### StartSSL (#startssl)
+
+**Website:** [www.startssl.com][startssl]
+
+**Important update (2018/03/12):** I've left this section
+in place for historical reference, but StartSSL is no
+longer issuing certificates after a series of questionable
+practices led to their root certificates being revoked from
+Chrome and Firefox. You can read more about the sordid
+details [on Wikipedia][startcomwiki].
+
+Even though StartSSL is probably not what most people want to use to get
+certificates created these days, I'm still going to give them an honorable
+mention because they were _the original_ free issuer, and really helped to get
+the ball rolling towards a better future.
+
+The good:
+
+* This operation has been around for years and they know what they're doing.
+ Being a fully fledged certificate authority, you'll get an easy upgrade path
+ if you need something a little heavier like an EV cert.
+
+The bad:
+
+* Getting certificates issued is a totally manual process and someone will need
+ to walk through it periodically to make sure your services stay online. This
+ used to be an unavoidable reality, but we've got much better options these
+ days.
+
+The ugly:
+
+* StartCom uses a client certificate system to log into their control panel and
+ the certificate issuing flow is long and fairly obtuse. Only more advanced
+ users will be able to understand what's going on here and get a certificate
+ issued safely.
+
+## Summary (#summary)
+
+There's a lot of information above, so here's a simple heuristic that should do
+the trick for most people:
+
+* If you're hosted on Amazon, you should use [ACM][acm].
+* If you're hosted on another service that gives you some kind of secure
+  termination (like Heroku or GitHub Pages), you should use
+ [CloudFlare][cloudflare].
+* Otherwise, you should use [Let's Encrypt][letsencrypt].
+
+For example, this site runs on Heroku. I have my domain terminated by
+CloudFlare at "https://brandur.org", and CloudFlare securely transports content
+from my HTTPS Heroku address at "https://brandur-org-next.herokuapp.com".
+
+That's it! Now please go out and secure your web properties.
+
+[acm]: https://aws.amazon.com/certificate-manager/
+[acme]: https://github.com/ietf-wg-acme/acme/blob/master/draft-ietf-acme-acme.md
+[caniuse]: http://caniuse.com/#feat=sni
+[chromeshame]: https://motherboard.vice.com/read/google-will-soon-shame-all-websites-that-are-unencrypted-chrome-https
+[cloudflare]: https://www.cloudflare.com/
+[firefoxshame]: https://hacks.mozilla.org/2016/01/login-forms-over-https-please/
+[letsencrypt]: https://letsencrypt.org/
+[sni]: https://en.wikipedia.org/wiki/Server_Name_Indication
+[startcomwiki]: https://en.wikipedia.org/wiki/StartCom
+[startssl]: https://www.startssl.com/
diff --git a/content/articles/go-lambda.md b/content/articles/go-lambda.md
new file mode 100644
index 000000000..ca69c4b68
--- /dev/null
+++ b/content/articles/go-lambda.md
@@ -0,0 +1,202 @@
+---
+title: "Speed and Stability: Why Go is a Great Fit for
+ Lambda"
+location: San Francisco
+published_at: 2018-01-17T16:31:34Z
+hook: Why Go's stability and simple deployments are a
+ good fit for a serverless environment.
+hn_link: https://news.ycombinator.com/item?id=16169592
+---
+
+A few days ago in a move foreshadowed by a hint at Amazon's
+re:Invent conference late last year, [AWS released support
+for Go on its Lambda platform][announce].
+
+Go users can now build programs with typed structs
+representing Lambda event sources and common responses in
+the [`aws-lambda-go` SDK][sdk]. These can then be compiled,
+bundled up into a ["Lambda deployment package"][package]
+(as simple as a ZIP file with a binary in it), and added to
+a new Lambda function by selecting "Go 1.x" as a runtime.
+
+!fig src="/assets/go-lambda/create-function.png" caption="Prompt for creating a new function on Lambda."
+
+Go fans around the world are undoubtedly celebrating the
+addition, but Gopher or not, this is a big step forward for
+everyone. Go may have its share of problems, but it has a
+few properties that make it an absolutely ideal fit for a
+serverless environment like Lambda.
+
+## Lambda runtimes are special snowflakes (#runtimes)
+
+Lambda's exact implementation details have always been a
+little mysterious, but we know a few things about them.
+User processes are started in sandboxed containers, and
+containers that have finished their execution may be kept
+around to service a future invocation of the same Lambda
+function (but might not be). Between function invocations
+containers are frozen, and no user code is allowed to
+execute.
+
+Containers are also flavored with one of the [preconfigured
+runtimes][runtimes] allowed by Amazon (this list is current
+as of January 16th):
+
+* **Node.js** – v4.3.2 and 6.10.3
+* **Java** – Java 8
+* **Python** – Python 3.6 and 2.7
+* **.NET Core** – .NET Core 1.0.1 and .NET Core 2.0
+* **Go** – Go 1.x
+
+That's a pretty good variety of languages, but more
+interesting is what's *missing* from the list. While .NET
+Core and Python are relatively up-to-date, Java 9 is
+absent, along with any recent major version of Node (7.x,
+8.x, or 9.x). Notably, major features like `async/await`
+(which landed in Node ~7.6) are still not available on
+the Lambda platform even a year after release.
+
+These holes tell us something else about Lambda: new
+runtimes are non-trivial to create, run, and/or maintain,
+so updated versions often lag far behind their public
+availability. Given that Lambda [will be four years old
+this year][history], it doesn't seem likely that Amazon
+will be able to address this deficiency anytime soon.
+
+!fig src="/assets/go-lambda/mountain.jpg" caption="Go 1.x's longevity is so impressive that it feels like a part of the landscape."
+
+## The remarkable tenacity of 1.x (#tenacity)
+
+That brings us back to Go. Lambda's Go runtime specifies
+version "1.x". At first glance that might not look all that
+different from other languages on the list, but there's a
+considerable difference: Go 1 was first released almost
+_six years ago_ in [March 2012][releases]!
+
+Since then, Go has followed up with nine more releases on
+the 1.x line (and with a tenth expected soon), each of
+which carried significant improvements and features. And
+while it's rare to ever have a _perfect_ release that
+doesn't break anything, Go's done as good of a job as is
+practically possible. Generally new releases are as pain-
+and worry-free as changing one number in a `.travis.yml`.
+
+This level and length of API stability for a programming
+language is all but unheard of, and it's made even *more*
+impressive given that Go is one of the most actively
+developed projects in the world -- a far shot from being
+stable only because it's stagnant. The only way this
+remarkable feat has been made possible is that (presumably)
+having experienced the pain involved in the API changes
+that come along with most language upgrades, Go's team has
+made stability a core philosophical value.
+
+There's [an entire article][go1] dedicated to the policies
+around stability for the 1.x line. Here's an excerpt where
+they explicitly call out that programs written for 1.x
+should stay working for all future versions of 1.x:
+
+> It is intended that programs written to the Go 1
+> specification will continue to compile and run correctly,
+> unchanged, over the lifetime of that specification. At
+> some indefinite point, a Go 2 specification may arise,
+> but until that time, Go programs that work today should
+> continue to work even as future "point" releases of Go 1
+> arise (Go 1.1, Go 1.2, etc.).
+>
+> The APIs may grow, acquiring new packages and features,
+> but not in a way that breaks existing Go 1 code.
+
+This might sound like normal [semantic versioning][semver]
+(semver), but semver only dictates what to do in the event
+of a breaking change. It doesn't say anything about
+frequency of change, or committing to not making breaking
+changes. Go's proven track record in this area puts it well
+ahead of just about any other project.
+
+That brings us back to Lambda. If we look back at our list
+of runtimes, the supported versions across languages might
+not look all that different, but it's a reasonably safe bet
+that the "Go 1.x" in that list is going to outlive every
+other option, probably by a wide margin.
+
+## Static binaries, dependencies, and forward compatibility (#static)
+
+[The Lambda guide for Go][guide] suggests creating a
+function by building a statically-linked binary (the
+standard for Go), zipping it up, and uploading the whole
+package to AWS:
+
+``` sh
+$ GOOS=linux go build -o main
+$ zip deployment.zip main
+$ aws lambda create-function ...
+```
+
+This is in sharp contrast to other supported languages where
+you send either source-level code (Node, Python), or
+compiled bytecode (.NET Core, Java). Static binaries have
+some major advantages over both of these approaches.
+
+Static linking removes the need for a dependency deployment
+system, which is often a heavy part of other language
+stacks. Anything that's needed by a final program is linked
+in at compile time, and once a program needs to execute, it
+doesn't need to think about project layout, include paths,
+or requirements files. Source-level dependency management
+has been a long criticized blindspot of Go, but with the
+addition of the `vendor/` directory in Go 1.6 and rapid
+uptake on the new [`dep`][dep] dependency management tool,
+the future is looking brighter than ever.
+
+Static binaries also carry the promise of forward
+compatibility. Unlike even a bytecode interpreter, when a
+new version of Go is released, the Lambda runtime may not
+necessarily need an update given that existing containers
+will be able to run the new binary. Time will tell for
+sure, but unlike Node users who are still transpiling to
+get `async/await` on Lambda, Go users should be able to
+push updated programs on the release day of a new version
+of Go [1].
+
+## Stability as a feature (#stability)
+
+It's rare to write software and not have it come back to
+haunt you in a few years' time as it needs to be fixed and
+upgraded. In a craft generally akin to the shifting sands
+in a whirling windstorm, Go is a rare oasis of stability.
+While more recently there has been some speculation as to
+[what Go 2.0 might look like][go2], there are still no
+concrete plans for any major breaking changes, and that's
+a feature.
+
+Along with the language's normal strengths -- incredible
+runtime speed, an amazing concurrency story, a great
+batteries-included standard library, and the fastest
+edit-compile-debug loop in the business -- Go's stability
+and ease of deployment is going to make it a tremendous
+addition to the Lambda platform. I'd even go so far as to
+say that you might want to consider not writing another
+serverless function in anything else.
+
+[1] This comes with the caveat that the `net/rpc` package that
+[`aws-lambda-go` uses for the entrypoint][entrypoint]
+remains stable across versions. This is reasonably likely
+though given that the package has been [frozen][frozen] for
+more than a year, and `net/rpc`'s serialization format,
+`encoding/gob`, [states explicitly][gob] that efforts will
+be made to keep it forward compatible.
+
+[announce]: https://aws.amazon.com/blogs/compute/announcing-go-support-for-aws-lambda/
+[dep]: https://github.com/golang/dep
+[entrypoint]: https://github.com/aws/aws-lambda-go/blob/master/lambda/entry.go
+[frozen]: https://go-review.googlesource.com/c/go/+/32112
+[go1]: https://golang.org/doc/go1compat
+[go2]: https://blog.golang.org/toward-go2
+[gob]: https://golang.org/pkg/encoding/gob/
+[guide]: https://aws.amazon.com/blogs/compute/announcing-go-support-for-aws-lambda/
+[history]: https://docs.aws.amazon.com/lambda/latest/dg/history.html
+[package]: https://docs.aws.amazon.com/lambda/latest/dg/lambda-go-how-to-create-deployment-package.html
+[releases]: https://golang.org/doc/devel/release.html#go1
+[runtimes]: https://docs.aws.amazon.com/lambda/latest/dg/current-supported-versions.html
+[semver]: https://semver.org/
+[sdk]: https://github.com/aws/aws-lambda-go
diff --git a/content/articles/go-worker-pool.md b/content/articles/go-worker-pool.md
new file mode 100644
index 000000000..36bdeca24
--- /dev/null
+++ b/content/articles/go-worker-pool.md
@@ -0,0 +1,328 @@
+---
+title: The Case For A Go Worker Pool
+published_at: 2016-08-19T21:22:23Z
+hook: Error handling and fragility; or why a worker pool belongs in Go's
+ standard library.
+location: San Francisco
+---
+
+When it comes to the question of what the right constructs for concurrency are
+for a language to expose to developers, I'm a true believer that Go's channels
+and goroutines are as good as it gets. They strike a nice balance between power
+and flexibility, while simultaneously avoiding the propensity for deadlocks
+that you'd see in a pthread model, the maintenance hell introduced by
+callbacks, or the incredible conceptual overhead of promises.
+
+However, there's one blind spot in Go's concurrency APIs that I find myself
+implementing in new Go programs time and time again: the worker pool
+(otherwise known as a [thread pool][thread-pool]).
+
+Worker pools are a model in which a fixed number of _m_ workers (implemented in
+Go with goroutines) work their way through _n_ tasks in a work queue
+(implemented in Go with a channel). Work stays in a queue until a worker
+finishes up its current task and pulls a new one off.
+
+Traditionally, thread pools have been useful for amortizing the costs of
+spinning up new threads. Goroutines are lightweight enough that that's not a
+problem in Go, but a worker pool can still be useful in controlling the number
+of concurrently running tasks, especially when those tasks are accessing
+resources that can easily be saturated like I/O or remote APIs.
+
+!fig src="/assets/go-worker-pool/worker-pool.svg" caption="A visualization of a worker pool: few workers working many work items."
+
+Implementing a worker pool in Go is by no means a tremendously difficult feat.
+In fact, [Go By Example][gobyexample] describes one implementation that's only
+a few dozen lines of code:
+
+``` go
+package main
+
+import (
+ "fmt"
+ "time"
+)
+
+func worker(id int, jobs <-chan int, results chan<- int) {
+ for j := range jobs {
+ fmt.Println("worker", id, "processing job", j)
+ time.Sleep(time.Second)
+ results <- j * 2
+ }
+}
+
+func main() {
+ jobs := make(chan int, 100)
+ results := make(chan int, 100)
+
+ for w := 1; w <= 3; w++ {
+ go worker(w, jobs, results)
+ }
+
+ for j := 1; j <= 9; j++ {
+ jobs <- j
+ }
+ close(jobs)
+
+ for a := 1; a <= 9; a++ {
+ <-results
+ }
+}
+```
+
+In this example, 3 workers are started and 9 work items are put onto a job
+channel. Workers have a work loop with a `time.Sleep` so that each ends up
+working 3 jobs. `close` is used on the channel after all the work's been put
+onto it, which signals to all 3 workers that they can exit their work loop by
+dropping them out of their loop on `range`.
+
+This implementation is meant to show the classical reason that a worker pool
+doesn't need to be in Go's standard library: the language's concurrency
+primitives are already so powerful that implementing one is trivial to the
+point where it doesn't even need to be put into a common utility package.
+
+So if primitives alone already present such an elegant solution, why would
+anyone ever argue for introducing another unneeded layer of abstraction and
+complexity?
+
+## Error handling (#error-handling)
+
+Well, there's a simplification in the above example that you may have spotted
+already. While it's perfectly fine if the workload for our asynchronous tasks
+is going to be to multiply an integer by two, it doesn't stand up quite as well
+when work items may or may not have to return an error. And in a real world
+system, you're _always_ going to have to return an error.
+
+But we can fix it! To get some error handling in the program, we introduce a
+new channel called `errors`. Workers will inject an error into it if they
+encounter one, and otherwise put a value in `results` as usual.
+
+``` go
+errors := make(chan error, 100)
+
+...
+
+// check errors before using results
+select {
+case err := <-errors:
+ fmt.Println("finished with error:", err.Error())
+default:
+}
+```
+
+We need to make one other small change too. Because some threads may now output
+over the `errors` channel rather than `results`, we can no longer use `results`
+to know when all work is complete. Instead we introduce a `sync.WaitGroup` that
+workers signal when they finish work regardless of the result.
+
+Here's a complete version of the same program with those changes:
+
+``` go
+package main
+
+import (
+ "fmt"
+ "sync"
+ "time"
+)
+
+func worker(id int, wg *sync.WaitGroup, jobs <-chan int, results chan<- int, errors chan<- error) {
+ for j := range jobs {
+ fmt.Println("worker", id, "processing job", j)
+ time.Sleep(time.Second)
+
+ if j%2 == 0 {
+ results <- j * 2
+ } else {
+ errors <- fmt.Errorf("error on job %v", j)
+ }
+ wg.Done()
+ }
+}
+
+func main() {
+ jobs := make(chan int, 100)
+ results := make(chan int, 100)
+ errors := make(chan error, 100)
+
+ var wg sync.WaitGroup
+ for w := 1; w <= 3; w++ {
+ go worker(w, &wg, jobs, results, errors)
+ }
+
+ for j := 1; j <= 9; j++ {
+ jobs <- j
+ wg.Add(1)
+ }
+ close(jobs)
+
+ wg.Wait()
+
+ select {
+ case err := <-errors:
+ fmt.Println("finished with error:", err.Error())
+ default:
+ }
+}
+```
+
+As you can see, it's fine code, but not quite as elegant as the original.
+
+### Potential fragility (#fragility)
+
+In our example above, we've accidentally introduced a fairly insidious problem
+in that if our error channel is smaller than the number of work items that will
+produce an error, then workers will block as they try to put an error into it.
+This will cause a deadlock.
+
+We can simulate this easily by changing the size of our error channel to 1:
+
+``` go
+errors := make(chan error, 1)
+```
+
+And now when the program is run, the runtime detects a deadlock:
+
+```
+$ go run worker_pool_err.go
+worker 3 processing job 1
+worker 1 processing job 2
+worker 2 processing job 3
+worker 2 processing job 5
+worker 1 processing job 4
+worker 1 processing job 6
+worker 1 processing job 7
+fatal error: all goroutines are asleep - deadlock!
+```
+
+It's quite possible to address that problem as well, but it helps to show that
+developing a useful and bug-free worker pool in Go isn't quite as simple as
+it's often made out to be.
+
+## Implementing a robust worker pool (#implementing)
+
+Putting together a good worker pool abstraction is pretty simple, and can even
+be done reliably with a minimal amount of code. Here's the worker pool
+implementation that builds this website for example (or [on GitHub][github]):
+
+``` go
+import (
+ "sync"
+)
+
+// Pool is a worker group that runs a number of tasks at a
+// configured concurrency.
+type Pool struct {
+ Tasks []*Task
+
+ concurrency int
+ tasksChan chan *Task
+ wg sync.WaitGroup
+}
+
+// NewPool initializes a new pool with the given tasks and
+// at the given concurrency.
+func NewPool(tasks []*Task, concurrency int) *Pool {
+ return &Pool{
+ Tasks: tasks,
+ concurrency: concurrency,
+ tasksChan: make(chan *Task),
+ }
+}
+
+// Run runs all work within the pool and blocks until it's
+// finished.
+func (p *Pool) Run() {
+ for i := 0; i < p.concurrency; i++ {
+ go p.work()
+ }
+
+ p.wg.Add(len(p.Tasks))
+ for _, task := range p.Tasks {
+ p.tasksChan <- task
+ }
+
+ // all workers return
+ close(p.tasksChan)
+
+ p.wg.Wait()
+}
+
+// The work loop for any single goroutine.
+func (p *Pool) work() {
+ for task := range p.tasksChan {
+ task.Run(&p.wg)
+ }
+}
+```
+
+And also a simple implementation for the `Task` that goes with it. Note that we
+store errors on the task itself to avoid the problem of a saturated Go channel
+above:
+
+``` go
+// Task encapsulates a work item that should go in a work
+// pool.
+type Task struct {
+ // Err holds an error that occurred during a task. Its
+ // result is only meaningful after Run has been called
+ // for the pool that holds it.
+ Err error
+
+ f func() error
+}
+
+// NewTask initializes a new task based on a given work
+// function.
+func NewTask(f func() error) *Task {
+ return &Task{f: f}
+}
+
+// Run runs a Task and does appropriate accounting via a
+// given sync.WaitGroup.
+func (t *Task) Run(wg *sync.WaitGroup) {
+ t.Err = t.f()
+ wg.Done()
+}
+```
+
+And here's how to run it and perform error handling:
+
+``` go
+tasks := []*Task{
+ NewTask(func() error { return nil }),
+ NewTask(func() error { return nil }),
+ NewTask(func() error { return nil }),
+}
+
+p := pool.NewPool(tasks, conf.Concurrency)
+p.Run()
+
+var numErrors int
+for _, task := range p.Tasks {
+ if task.Err != nil {
+ log.Error(task.Err)
+ numErrors++
+ }
+ if numErrors >= 10 {
+ log.Error("Too many errors.")
+ break
+ }
+}
+```
+
+## Summary (#summary)
+
+Even though putting together a robust worker pool isn't overly problematic,
+right now it's something that every project needs to handle on its own. The
+size of the pattern is also almost a little _too_ simple for an external
+package, as evidenced by the dozens (if not hundreds) of Go worker pool
+implementations that you can find on GitHub. Coming to community consensus at
+this point on a single preferred third party package would be nigh impossible.
+
+It seems to me that this is one easy place that the Go maintainers team could
+help guide developers and prevent a wild amount of fracturing by introducing a
+True Path. I'd love to see a worker pool in core.
+
+[github]: https://github.com/brandur/sorg/tree/master/pool
+[gobyexample]: https://gobyexample.com/worker-pools
+[thread-pool]: https://en.wikipedia.org/wiki/Thread_pool
diff --git a/content/articles/go.md b/content/articles/go.md
new file mode 100644
index 000000000..f2c689300
--- /dev/null
+++ b/content/articles/go.md
@@ -0,0 +1,246 @@
+---
+hook: Notes on the language after spending a few weeks building a large project in
+ it.
+location: San Francisco
+published_at: 2016-03-31T20:37:11Z
+title: Notes on Go
+---
+
+Despite having worked for so many years with colleagues who were major
+proponents (or even contributors) to Go/Golang, I'd somehow gone this long
+without having ever written a production grade service in the language, having
+only dabbled so far with toy projects, small scripts, and minor contributions
+to existing software. That all changed in the last few weeks, where I had the
+opportunity to assemble a pretty serious project in the language from scratch.
+
+I took notes throughout the process so as not to lose my (relatively) fresh
+outlook on the language.
+
+## The important things (#important)
+
+### Simple, but verbose (#verbose)
+
+Building new programs requires typing **a lot**. The language is incredibly
+verbose and has few shortcuts. The upside is that once you have typed out
+that initial code, it's eminently readable and relatively easy to maintain
+compared to many other languages.
+
+All too often, the bar to understanding projects written in Ruby, Lisp,
+Haskell, Rust, C#, C++, or whatever, isn't _just_ figuring out the code, it's
+also deciphering the localized (and often overcomplicated) abstractions that
+every developer ends up baking into their code to reduce line count, and in
+many cases that's significant cognitive overhead. In Go that problem doesn't
+exist to anywhere near the same extent.
+
+### Concurrency (#concurrency)
+
+After spending some time with them, I'm firmly convinced that green threads
+(Goroutines) and channels are the only way that concurrency should be exposed
+to developers.
+
+When working in languages like Ruby (to pick an example of one that I'm very
+familiar with), even with experience, doing any work with concurrency is
+incredibly frustrating in just the sheer number of problems that you're likely
+to run into. It's tempting to think that this is because concurrency is
+inherently difficult, but it's more to do with dull primitives that are error
+prone by default.
+
+By comparison, when working in Go, it's amazing how many programs you can write
+where your concurrent code will work _perfectly_ the first time. I also find
+that even in cases where it doesn't, it's far more often due to a conceptual
+mistake that I've made than it is to a poorly designed language feature.
+
+I also appreciate just how opinionated the Go team has been on this front.
+Other languages with strong concurrency stories like Haskell and Rust have
+opted to give users access to every type of primitive under the sun, and in the
+long run that tyranny of choice leads to an ecosystem of mixed ideas and no
+clear way forward.
+
+### Speed (#speed)
+
+Speed is absolutely critical, and not just for the runtime, but _for the
+tooling_. Being able to compile and run your entire test suite in under a
+second [1] changes the entire development experience in such a fundamental way
+that it's hard to adequately describe. After working with a Go project for a
+while, going back to 10+ second iteration loops in languages like Ruby feels
+like trying to run a marathon while waist deep in a bog.
+
+This is still a key place where Go stands apart even from other modern
+languages which tend to focus on runtime speed or sophisticated features while
+ignoring the development cycle [2].
+
+But Go is also fast at runtime too. It's nice to be able to write code in a
+high-level language and be able to trust that it will run quickly.
+
+### Deployment (#deployment)
+
+If every language was as easy to deploy as Go, Docker would never have been
+invented (maybe a slight exaggeration, but the need for it wouldn't have been
+anywhere near as dire). Build a binary. Copy it to a target server. Stop the
+old service and bring the new one up. That's it. No weird environment problems.
+No dependency headaches. No Bundler.
+
+I now write all my throwaway scripts in Go for the same reason. If I ever need
+to run one with Cron, I know that I'm never going to have to deal with issues
+with `$PATH` or rbenv or anything else. Copy the executable to
+`/usr/local/bin`, inject it straight into my Crontab, and you're done.
+`killall` even works; incredible.
+
+## Other notes (#other)
+
+### The good (#good)
+
+There's a lot to like about Go:
+
+* **Defer:** I love this abstraction. Although not quite as safe as something
+ like a C# `using` block (in that you might accidentally remove the line and
+ not notice), it's far less cluttering.
+* **Import:** I'm firmly convinced now that importing packages with a short
+ canonical identifier (e.g. `fmt` or `http` from "net/http") and then having
+  only one option for referencing that package in code is the One True
+ Way. No more symbols with unknown and dubious origin (Haskell) or artisanal
+ blends of qualified and non-qualified names (C#/Java/other).
+* **Select:** Although decisions like using `default:` to make a `select`
+ blocking or non-blocking are a little obtuse, overall this construct is
+ incredibly powerful.
+* **Pipelines:** By combining a few of the built-in language features, Go
+ enables the construction of composable, scalable [pipelines][pipelines]. This
+ approach to parallelism is truly elegant and encourages developers to write
+ programs that are more performant and which make better use of modern system
+ resources (namely, more cores).
+* **Labels:** Incredibly useful for breaking out of an outer loop without
+ introducing boilerplate. When used carefully, `goto` is also tremendously
+ powerful (and comes with the perfect number of restrictions to prevent its
+ abuse).
+* **No metaprogramming and minimal OO:** Sometimes the costs of what seem like
+ good features on the surface outweigh their benefits. I'll gladly write a
+ little more code if it means that someone else will be able to understand it.
+* **Static linking:** Go didn't invent this, but they did make it default.
+ Static linking introduces some headaches in a few cases, but vastly improves
+ the lives of the other 99% of users.
+* **Standard library in Go:** It's an amazing feature to be able to check the
+ implementation of core packages in the standard libraries. This isn't all
+ that unusual for newer languages these days, but it's becoming increasingly
+  harder for languages like Ruby and Python to make the argument that having
+  a standard library written in C is just fine.
+* **Nice documentation features:** Go has some neat innovations in
+ documentation that solve real problems that are observable in almost every
+ other language and framework. e.g. A locally runnable documentation server
+ (great for flights), or [testable examples][testable-examples], which mean
+ that examples in documentation get run with the test suite so that they don't
+ fall out of date.
+
+### The surprisingly good (#surprisingly-good)
+
+There were a lot of facets of Go that I read or heard about before trying and
+which I was pretty sure that I wouldn't like. However, after using the language
+a while I quickly started warming up to them:
+
+* **Dependency management:** It took me a while to warm up to Go's design
+ around dependency management, but not having to run and manage everything
+ through a slow and complex system like Bundler hugely improves the
+ development experience. It also makes it very easy to jump into foreign
+ libraries and examine their implementations when necessary.
+* **Gofmt:** Having a single convention for the language makes collaboration
+ easier, and makes my own coding faster (in that I can rely on gofmt to
+ correct certain things).
+* **Errors on unused variables:** These can be very annoying, but I can't deny
+ that these error messages have saved me from what would otherwise have been a
+ bug multiple times now.
+* **No generics:** Having types only on special data structures like slices and
+ maps gets you surprisingly far. Although not having generics does make using
+ the language for certain things difficult, I was amazed after having built a
+ multi-thousand LOC program to realize that I hadn't wanted for them once.
+
+### The bad (#bad)
+
+I really did make an effort, but even so, some parts of the language are hard
+to love:
+
+* **Error handling:** I like that generally my programs don't crash, but
+ dealing with errors requires an incredible level of micro-management. Worse
+ yet, the encouraged patterns of passing errors around through returns can
+ occasionally make it very difficult to identify the original site of a
+ problem.
+* **The community:** Reading the mailing lists can still be pretty
+ depressing. Every critique of the language or suggestion for improvement, no
+ matter how valid, is met with a barrage of either "you're doing it wrongs",
+  or "only the Go core team can have thoughts that are worth
+ consideration" [3]. Previously this level of zealotry had been reserved for
+ holy crusades and text editors.
+* **Debugging:** gdb and pprof both work with Go, but with experiences that are
+ rough enough around the edges that you'll find yourself often resorting to
+ print-debugging just to avoid the hassle.
+* **Noisy diffs:** The downside of gofmt is the possibility of noisy diffs. If
+ someone adds a new field with a long name to a large struct, all the spacing
+ changes and you end up with a huge block of red and a slow review [4].
+* **Quirky syntax and semantics:** Go is littered with quirky language syntax
+ and semantics that are fine once you know them, but are unnecessarily opaque.
+ Some examples:
+ 1. The distinction between `new`, `make`, and initialization with composite
+ literals.
+ 2. Interfaces are always references.
+ 3. Symbols that start with capital letters are exported from packages.
+ 4. Channels created without a size are blocking.
+ 5. `select` blocks with a `default` case become non-blocking.
+ 6. You check if a key exists in a map by using a rare second return value
+ of a normal lookup with square brackets.
+ 7. Named return values.
+ 8. Closing a channel causes any Goroutine that was listening on it to fall
+ through having received a zero value of the channel's type.
+ 9. Comparing interfaces to `nil` is allowed by the compiler, but apparently
+ not a good idea, and can lead to some strange bugs.
+* **JSON:** Is [as slow as reported][slow-json] due to its extensive use of
+ reflection. This wouldn't seem like it should be a problem, but can lead to
+ surprising bottlenecks in otherwise fast programs.
+
+### The ugly (#ugly)
+
+There are very few parts of the language that are unapologetically bad, but
+that said:
+
+* **Assertions:** Although mostly palatable, the omission of a collection of
+ meaningful assert functions (and the corresponding expectation that a
+ custom-tailored message should be written every time you want to check that
+ an error is nil) isn't great to say the least.
+
+ The real problem though is that the unnecessary verbosity of tests acts as a
+ natural deterrent to writing them, and projects with
+ non-existent/poor/incomplete test suites are a dime a dozen. I've been using
+ the [testify require package][testify] to ease this problem, but there should
+  be an answer in the standard library.
+
+## Summary (#summary)
+
+Overall, I never quite reached the level of feverish passion for Go that many
+others have, but I consider it a thoroughly solid language that's pleasant to
+work with. Furthermore, it may have reached the best compromise that we've seen
+so far in the landscape of contemporary languages in that it imposes
+constraints that are strong enough to detect large classes of problems during
+compilation, but is still fluid enough to work with that it's easy to learn,
+and fast to develop.
+
+[1] Without tricks like Zeus that come with considerable gotchas and side
+ effects.
+
+[2] e.g. Rust, or, and it hurts me to say this, Haskell.
+
+[3] The best single example of this that I've found so far is [a request for a
+ non-zero exit code in Golint][golint]. The community articulates the
+ problem and shows an obvious demand and willingness to help. Meanwhile the
+ member of Go core can't manage to build even a single cohesive
+ counterargument, but even so, the issue along with all its ideas and
+ suggestions are summarily rejected.
+
+**Follow-up (2016/06/24):** Russ Cox re-opened the issue (presumably after
+seeing complaints on Twitter given that the original maintainer had locked
+discussion on it) and it was subsequently resolved.
+
+[4] `?w=1` on GitHub to hide whitespace changes helps mitigate this problem,
+ but isn't the default, and doesn't allow comments to be added.
+
+[golint]: https://github.com/golang/lint/issues/65
+[pipelines]: https://blog.golang.org/pipelines
+[slow-json]: https://github.com/golang/go/issues/5683
+[testable-examples]: https://blog.golang.org/examples
+[testify]: https://github.com/stretchr/testify#require-package
diff --git a/content/articles/golang-packages.md b/content/articles/golang-packages.md
new file mode 100644
index 000000000..00c51c84a
--- /dev/null
+++ b/content/articles/golang-packages.md
@@ -0,0 +1,73 @@
+---
+hook: Understanding the benefits of Golang's restrictive (but simple) import and package
+ management system.
+location: San Francisco
+published_at: 2014-10-20T00:54:44Z
+title: Package Management in Go
+---
+
+Go's strategy for package management is a little untraditional by the standards that most language ecosystems are setting today. Nearly every other language that's under active development today has opted for an approach to dependency management that involves central repositories accompanied by a recipe that's checked in with projects, with instructions on how to rebuild the dependency tree that they need to run: Rubygems in Ruby, NPM in Node, Maven for the JVM, Cargo for Rust, etc. Go's more exotic approach can be a little harder for new entrants to understand, especially if they're coming in from other languages.
+
+Personally, some basic principles left me reeling. Here's an [excerpt from the Go FAQ](http://golang.org/doc/faq#get_version) suggesting that maintainers of public packages fork their projects if they're introducing a backwards incompatible change:
+
+> Packages intended for public use should try to maintain backwards compatibility as they evolve. The Go 1 compatibility guidelines are a good reference here: don't remove exported names, encourage tagged composite literals, and so on. If different functionality is required, add a new name instead of changing an old one. If a complete break is required, create a new package with a new import path.
+
+You have to fork a project to change the API?! It's hard to believe how this could be considered correct in any world, and many Go articles are so apologetic that downsides of this approach are never addressed. For those who were as confused about this as I was, let's address a few of the special characteristics of Go's import and packaging system.
+
+## The workspace (#workspace)
+
+At the heart of Go's conventions is [the workspace](https://golang.org/doc/code.html#Workspaces). A workspace is simply any directory that your `$GOPATH` is currently referencing, and which has a basic layout like the following:
+
+```
+bin/
+pkg/
+src/
+```
+
+Whatever project you're working on should be in _a_ workspace at a location like `$GOPATH/src/github.com/brandur/heroku-agent`. When you introduce a dependency in an `import` statement, you're actually telling the compiler to resolve it within your workspace, even if it looks like that dependency comes from a remote host:
+
+```
+import (
+ "github.com/brandur/my-dependency"
+)
+```
+
+`go get` can retrieve your project's dependencies from a variety of providers and store them to your workspace for compilation.
+
+## DVCS & the central repository (#dcvs)
+
+A common line in the community is that Go's package system is "built on top of distributed version control", which reads like it's adding some extra layer of robustness on top of a more traditional packaging system. Although nominally true, the far and away most popular convention is to reference DVCS hosts like GitHub or Google Code, which are no more distributed or robust than RubyGems. The operation of `go get` isn't too radically different than a `bundle install --local`.
+
+This doesn't eliminate any central point of failure, but it does have the notable advantage of making the community less dependent on a single central repository that's quite expensive to maintain. Central repositories like RubyGems and NPM owe their continued existence and development to largely charitable sponsorship. Although this has traditionally worked quite well, this may not last forever, especially if either community loses some of the impressive public support that they currently enjoy. Go's approach allows support for the DCVS providers du jour to be added or removed as necessary; at the end of the day, the only common functionality required of a provider or source control system is to be able to check out source code to a known path.
+
+## No relative imports (#no-relative-imports)
+
+One of the most astonishing aspects of Go's import for me was that any kind of [relative import is strongly discouraged](https://groups.google.com/forum/#!topic/golang-nuts/_usbgS9LeS8) (relative import is only allowed outside a workspace). This seems reasonable when referencing external dependencies, but is pretty inconvenient when building more complex Go projects that are divided into subpackages where convention is still to fully qualify everything:
+
+``` go
+import (
+ "github.com/goraft/raft/protobuf"
+)
+```
+
+This syntax leaves a lot of open questions: am I actually referencing the master branch on GitHub? Do I have to push changes in my subpackages up before I can use them in main? Well, yes to the former, but not the latter. Once again, this approach is made tenable by workspaces: `import` statements always reference code within your workspace, but can contain a provider so that they can be fetched by `go get`. The code within the current workspace is what gets used for compilation, even if it's deviated from what's in the origin's master branch.
+
+Removing the concept of a relative import has advantages as well: understanding of the local file hierarchy is no longer required to build a package, making paths in Go packages easier to reason about. This guarantees a nice consistency between projects; at no point in time is it necessary to untangle a project's exotic approach to organization.
+
+It's also astoundingly good for open source in that it's trivial to find, inspect, and manipulate your project's dependencies when it becomes necessary to do so. Every package's location is directly implied by its import path and easy to find.
+
+## Vendoring (#vendoring)
+
+So importing from master is great, but any kind of non-trivial program will eventually need a way to make deployments repeatable, which means locking down dependencies. Again, the [Go FAQ](http://golang.org/doc/faq#get_version) somewhat surprisingly recommends that dependencies be locked by vendoring them into a project:
+
+> If you're using an externally supplied package and worry that it might change in unexpected ways, the simplest solution is to copy it to your local repository. (This is the approach Google takes internally.) Store the copy under a new import path that identifies it as a local copy. For example, you might copy "original.com/pkg" to "you.com/external/original.com/pkg". Keith Rarick's goven is one tool to help automate this process.
+
+This technique raises questions around repository cleanliness in that a lot of extraneous source code gets ported around with the import pieces. Go diffs can be nightmarish to read.
+
+However, it does have the advantage of making builds not dependent on the availability of external services. It also avoids any dependency hell type problems where two dependencies rely on different versions of a third.
+
+## It's about simplicity (#simplicity)
+
+Like everything in Go, the import system is based on the same fundamental principle of simplicity that the [rest of the language encourages](http://bradgignac.com/2014/09/24/avoiding-complexity-with-go.html). Packages are resolved using the same popular version control systems that you use to store your source code. Packages are housed in the same location as your project (the `$GOPATH`). There is no versioning of any kind; the compiler ingests whatever code is on disk. Everything can be resolved and built by the Go compiler without any other special tooling.
+
+The merits of Go's approach compared to other languages are certainly disputable, but the refreshing minimality of Go's system can't be easily dismissed. Having recently spent an hour loosening version constraints in the upstream dependencies of a large Ruby app, and another two trying to have Maven resolve a common HTTP library, working with a version control system that I can easily reason about is an attractive prospect indeed.
diff --git a/content/articles/heroku-values.md b/content/articles/heroku-values.md
new file mode 100644
index 000000000..7dc5623b5
--- /dev/null
+++ b/content/articles/heroku-values.md
@@ -0,0 +1,377 @@
+---
+hook: Some of my favorite practices and ideas from almost four years at Heroku.
+image: "/assets/heroku-values/heroku-values.jpg"
+location: San Francisco
+published_at: 2015-11-05T06:20:16Z
+title: My Heroku Values
+hn_link: https://news.ycombinator.com/item?id=14286143
+---
+
+In the spirit of [Adam Wiggins' inspiring list of Heroku
+values][wiggins-values] which was published when he left the company that he
+co-founded, I wanted to publish a list of my own now that I've transitioned
+away.
+
+My time at Heroku was easily the most valuable learning experience of my life,
+and I'll always remember my time there very fondly. I remember upon first
+joining just how dysfunctional and inefficient it made the jobs that I'd
+held previously seem in contrast, and I'm hoping that by putting some of these
+concepts down on paper I'll be able to reference and re-use them in my future
+work.
+
+I suspect that at least some of these ideas might be interesting to even those
+with no relation to the company. Heroku was a place founded and formed by
+people who came from outside the traditional corporate structure, and what
+resulted was a structure that diverged from that of almost anywhere
+else. Even the bad ideas should be novel enough to be intriguing in a mild way
+(not to suggest that there are any here, of course!).
+
+I should add the caveat that this is a compendium of values from the entire
+duration of my stay at the company; not all had been established when I got
+there, and not all were still in place when I left.
+
+## Technology (#technology)
+
+### The platform (#platform)
+
+One of the greatest pleasures during work at Heroku was the Heroku product
+itself. Apps could be created and deployed in seconds, which encouraged
+innovation by making prototyping easy, and allowed incredibly fast iteration on
+production products. Every company should have a Heroku-like interface for
+their developers to use.
+
+I wouldn't go so far as to say that companies should definitively use Heroku, but
+it is a good way to have one without a major investment in infrastructure. As a
+company scales, it might be worth putting a self-hosted one in place like
+Remind has done with [Empire][empire] or Soundcloud has done with
+[Bazooka][bazooka] (PDF warning). GitHub's model of deploying experiments and
+small apps to Heroku and eventually promoting them to more dedicated
+infrastructure (if necessary) is also worthy of note as a pretty nice
+compromise between development agility and performance.
+
+### Dogfooding (#dogfooding)
+
+Continuing from above, we used our own products wherever possible. Every
+production app at the company was deployed on the Heroku platform except for a
+small set of core services that couldn't be. More internal Salesforce apps were
+making their way over every year as well, demonstrating that the idea was
+valuable enough to be organically making its way out and into the much larger
+parent company.
+
+Every internal app that required a login (e.g. the Heroku Dashboard, the Help
+system, the add-ons SSO portal) used the same Heroku OAuth provider that's
+available to third parties, leaving services loosely coupled and easy to build.
+
+Still one of my favorite accomplishments is that Dashboard (the service that
+allows customers to log into a web interface and manage their apps) runs off of
+the same [public V3 API][heroku-api] available to customers. I can't even
+describe the number of bugs uncovered by this technique; bugs that would have
+otherwise been encountered by frustrated customers or third-party developers.
+
+### Twelve factor (#twelve-factor)
+
+[Twelve-factor][twelve-factor] methodology provided a very nice set of guiding
+principles for internal apps so that an engineer could reason about them more
+easily. Every app got its configuration from the environment. Every app had a
+Procfile. Every app emitted logs to standard out.
+
+I've previously read criticism on twelve-factor which postulates that it's an
+artificial set of principles to work around limitations in the platform. I
+don't buy this for a second, but I'll let [Randall Degges cover this
+position][degges-12factor] because he puts it far more succinctly than I ever
+could.
+
+Eventually some of us would wish for and try to develop even stronger
+conventions for building apps (see [service
+conventions](#service-conventions)), but the relatively straightforward set of
+twelve-factor principles got us started and would always act as a solid
+foundation that everyone agreed on.
+
+### The HTTP API design guide (#http-api-design)
+
+A fundamental law of the universe is that every engineer will design an HTTP
+API slightly differently, even if they're being guided by prior art. This isn't
+always a problem, but it's a challenge if you're trying to keep an API cohesive
+when it might be contributed to by dozens of different people. I've seen
+engineers name their new resource `/resource_with_underscores` even though 78
+out of 78 existing resources look like `/resource-with-hyphens`.
+
+We knew that if we wanted a consistent public API, we needed to codify a set of
+opinionated conventions, which is why we wrote the [the HTTP API design
+guide][api-design-guide] based off of the decisions we'd made building the V3
+API. The result is that Heroku's API is one of the most self-consistent HTTP
+APIs that you'll find anywhere in the real world.
+
+### Service conventions (#service-conventions)
+
+Twelve-factor offered some convention when it came to deploying new services,
+but we tried to take standardization much further with our service toolkit
+[Pliny][pliny], which was designed to offer a powerful out-of-the-box stack
+that would be a sane choice for most internal Heroku apps.
+
+The only misstep with regards to Pliny and service conventions is that we
+should have pushed them earlier and harder. Even the basic form in which the
+project exists today took the company a long way in that not every new service
+was a special snowflake of its author's favorite ideas (previously a major
+problem), but we could have gone so much further by putting in automatic
+distribution of updates, more free services (e.g. built-in rate limiting), and
+service discovery and registration. Internal service frameworks are an
+important enough problem that most mid-sized I/P/SaaS companies should have
+dedicated people building them.
+
+### Postgres (#postgres)
+
+If given the opportunity to start a new stack from a blank slate, I might avoid
+some of Heroku's current technological staples (e.g. Ruby). One of the few that
+I _would_ use without a doubt though is Postgres. It's powerful, flexible,
+incredibly stable, and has consistently been a pleasure to work with over the
+years. Having recently had the misfortune to see how other aggrandized database
+software operates in production, I feel that I now have an especially sober
+view of just how good it really is relative to other products on the market.
+
+It's possible that we missed out on some cutting edge technologies that would
+have offered major benefits, but the resources saved by _not_ jumping on every
+data store du jour is incalculable. There's probably still room in Heroku's
+stack for an HA store, but it was the right thing to do to delay the
+introduction of one until a number of mature options were available. In the
+meantime, we got really good at operating Postgres and it was fine for almost
+everything.
+
+The only thing better than Postgres itself was our Heroku Data team (known
+affectionately internally as the DOD, or Department of Data). This team of
+hugely talented engineers saved my skin an untold number of times as I dealt
+[with pretty messy operational problems][postgres-queues] [1]. I was told a
+number of times that as the operator of our largest internal database, I was
+their highest-maintenance customer, and it was true.
+
+### Ephemeralization (#ephemeralization)
+
+One powerful idea was that of _ephemeralization_, which can be roughly
+described as "doing more with less". But aside from doing more, the act of
+reducing the number of moving parts in a system helps to lower its cognitive
+burden and made learning it easier. In a similar vein, picking one true way
+forward from a collection of similar options helps keep engineers productive as
+they move between components.
+
+A few examples:
+
+* Pick and choose single "right" technology stacks from a set of like options.
+ For example, prefer Ruby over Python. More generally, try to focus on _just_
+ Ruby and Go (and Javascript) over the long run.
+* Try to zero in on a particular library to perform certain functions. For
+ example, preferring Puma for Ruby HTTP stacks by converting existing installs
+ of Unicorn, or Thin. Using Sequel instead of ActiveRecord.
+* Standardize deployment images so that instead of having individual Chef
+ recipes for every component, all would share only one and be configured
+ purely at the application level.
+* Use a single type of data store consistently. i.e. Postgres.
+* Don't create internal forks of libraries (this one should be obvious, but it
+ doesn't seem to be).
+
+### Use services (#use-services)
+
+Whenever you can use hosted services instead of operating them yourself.
+Although the cost of infrastructure and bringing a new service online is
+usually fairly well-understood, the full personnel costs of maintaining that
+service (i.e. who's going to upgrade it and migrate data a year down the road)
+and retiring it when the time comes are rarely considered.
+
+## Culture (#culture)
+
+### Leadership & inspiration (#inspiration)
+
+I've never had the opportunity to work with so many people who inspired me on
+such a fundamental level as those who I met at Heroku, especially in my early
+days there. The company had everything at one point: great leaders, inspiring
+thinkers, and incredibly ambitious engineers. As someone still relatively
+inexperienced and new to the technological powerhouse that is the Bay Area, my first few
+months felt like a constant assault of new ideas about everything from
+technology to organizational structure. This motivated me to want to build
+great things and made work and the learning I did there all around exciting.
+
+### Self-service (#self-service)
+
+Instead of doing work for someone, give them the tools necessary for them to do
+it for themselves. For example, Heroku's core API service had a private
+administrative branch that employees with a CLI plugin could use to perform
+special actions like re-send a sign-up e-mail. This creates a powerful
+precedent for people to try things out for themselves before leaning on
+someone else. If sufficient coverage is reached, this technique helps to
+prevent [constant disruption on open communication channels][slack-distractor]
+so that people have time to work.
+
+### Cross-team contribution (#cross-contribution)
+
+Want a new feature or improvement? Send a pull request for it. There is no
+better way to demonstrate your commitment to an idea. It also had the side
+benefit of giving engineers a wider insight into how the whole machine works by
+forcing them to look beyond the narrow confines of the projects that they might
+maintain day to day.
+
+This obviously doesn't scale to infinity, but it does scale far further than
+many people would have you believe.
+
+### Shipping cadence (#shipping)
+
+We shipped our services fast and frequently, and had a framework of tooling to
+make it safe to do so. You'd more often than not see a change go out same day,
+which kept endless possibilities open for shipping new products or improving
+existing ones.
+
+This was also something that had to be discovered at the organization level.
+There was a period in Heroku's history where projects were hard to ship mostly
+due to a weak process for getting them across the finish line. This problem was
+examined and corrected, and today products make it out the door on a regular
+basis.
+
+### Strong engineers (#engineers)
+
+At its essence, this one is pretty obvious: hire good engineering talent.
+
+But things get a little more murky when examining them in closer detail. You of
+course want to look for people who are good at what they do, but it may be even
+more important for them to be flexible enough to jump in and fix bugs or modify
+almost any project. This requires a degree of being able to learn independently
+and figure things out for themselves that not everyone is well-suited for, but
+if achieved will result in fewer disruptions to the rest of the team and more
+work output overall. These ideal candidates may not be able to do a good job of
+inverting a binary tree on a whiteboard and may not have a Stanford education
+on their CV, and the interview process may have to be adjusted accordingly to
+accommodate them.
+
+For quite some time we had a team that would sync up once a week and plow
+through huge workloads for the rest of it. Communication happened largely
+asynchronously except for the occasional instance where a higher bandwidth
+channel was more suitable. It was the most productive environment that I've
+ever seen.
+
+### Technical culture (#technical-culture)
+
+Technical culture was fostered, which (I believe) led to a high degree of
+technical excellence in the products that we produced. This mostly manifested
+in the way of papers being passed around, general discussions on the
+engineering mailing list, and plenty of forward-thinking water cooler
+speculation on how to improve products and internal architecture. For a long
+time we also held a technical event every Friday called "Workshop" where
+engineers could show off some of the interesting projects that they were
+working on. It was designed to educate and inspire, and it worked.
+
+### Flexible environment (#flexible-environment)
+
+
+
+ The Agora Collective in Berlin.
+
+
+Traditional organizations generally hold a strong belief that every employee
+should physically punch in at 9 AM, leave at 5 PM, and keep that up for 5
+days a week year round. At Heroku people would regularly work at home or out of
+the office. It made very little difference to their productivity, but did have
+a profoundly positive effect on their overall happiness. For example, I visited
+my family back in Calgary for weeks at a time two or three times a year, and
+worked from Berlin for roughly three weeks almost every year that I was at the
+company.
+
+This is all possible if a company hires well. If you've got the right people on
+your team, you don't have to keep an eye on them all day because they'll do the
+right things themselves.
+
+### Coffee (#coffee)
+
+Admittedly, this one is a little self-indulgent, but I came to appreciate
+coffee for the first time while at Heroku. For the longest time, there wasn't
+even a coffee machine in the office; just Chemex pots, a grinder, and paper
+filters. The idea was that making coffee would be five to ten minute process,
+during which there would be time to interact with colleagues who happened to
+drop by the area. The system worked.
+
+I learnt how to use both Chemex and AeroPress; both of which I continue to use
+regularly.
+
+## Process & organization (#process)
+
+### GitHub (#github)
+
+
+
+ The OctoTrophy (dodgeball).
+
+
+GitHub has been one of the best pieces of software on the Internet for years,
+and is the right way to organize code and projects. Companies should be using
+tools that developers can extend to optimize their workflows and maximize their
+own efficiency. With a well-maintained API and healthy ecosystem of
+supporting tooling like [hub][hub] and [ghi][ghi], as well as complementing
+turn key services like Travis, GitHub is one of those tools.
+
+Time that developers *don't spend* supporting custom infrastructure or fighting
+bad tooling is time that can be used to build your product.
+
+### Access to resources (#resources)
+
+If an engineer needed a new resource for a service being deployed, prototype,
+or even one-off experiment, they were at liberty to provision it and keep on
+working, even if that resource wasn't free. Resources here might include
+everything from dynos to run an app, to a Postgres backend, to a few extra EC2
+boxes for deployment to bare metal (relatively speaking). Having Heroku's
+considerable catalog of add-on providers and being completely deployed to AWS
+helped a lot here in that no internal personnel were ever needed to help with
+provisioning.
+
+This practice works because despite a nominal cost to the organization, it
+keeps engineer momentum up and the cost of prototypes down. Hopefully it's
+becoming fairly standard practice in many newer companies these days, but it's
+an easy thing to get wrong. I've previously seen the other side where
+provisioning a job queue is a multi-month process involving endless meetings,
+territorial ops people, and mountains of paperwork. Although some care needs to
+be taken to not shoot from the hip when dropping in new technology, that
+approach doesn't help anyone.
+
+### Total ownership (#total-ownership)
+
+Our own version of "devops", total ownership was meant to convey that a team
+responsible for the development of a component was also responsible for its
+maintenance and production deployment. This added mechanical sympathy has huge
+benefits in that getting features and bug fixes out is faster, manipulating
+production is less esoteric, tasks that require otherwise tricky coordination
+(like data migrations) are easier, and every person involved generally
+takes more personal responsibility for the product (which leads to
+more uptime).
+
+Total ownership was instrumental in helping me to improve my skill in
+engineering, but I'm still a little on the fence about it. While I don't miss
+the multi-week deployment schedules, I do miss the regular blocks of daily
+focus during which I would never have to stop work and deal with an
+interruption from production.
+
+### Technical management (#management)
+
+When I started at Heroku, my manager knew the codebase better than I did, knew
+Ruby better than I did, and pushed more commits in a day than I would do in a
+week. During our planning sessions we'd sketch in broad strokes on how certain
+features or projects should be implemented, and leave it up to the
+self-initiative of each engineer on the team to fill in the blanks. There
+wasn't the time or the interest for micromanagement.
+
+We eventually moved to a place where a virtuous manager was one who didn't
+commit code, wasn't on the pager rotation, and never looked at a support
+ticket (i.e. probably the situation that most big organizations have). But
+although technical management wasn't an idea that lasted, it was a very good
+place to be an engineer while it did.
+
+[api-design-guide]: https://github.com/interagent/http-api-design
+[bazooka]: http://gotocon.com/dl/goto-zurich-2013/slides/AlexanderSimmerl_and_MattProud_BuildingAnInHouseHeroku.pdf
+[degges-12factor]: http://www.rdegges.com/heroku-isnt-for-idiots/
+[empire]: https://github.com/remind101/empire
+[ghi]: https://github.com/stephencelis/ghi
+[heroku-api]: https://devcenter.heroku.com/articles/platform-api-reference
+[hub]: https://github.com/github/hub
+[maciek]: https://twitter.com/uhoh_itsmaciek
+[pliny]: https://github.com/interagent/pliny
+[postgres-queues]: /postgres-queues
+[slack-distractor]: http://www.guilded.co/blog/2015/08/29/slack-the-ultimate-distractor.html
+[twelve-factor]: http://12factor.net/
+[wiggins-values]: https://gist.github.com/adamwiggins/5687294
+
+[1] Thank-you [Maciek][maciek] in particular for stepping in and helping out
+ with my Postgres woes way more often than you should have.
diff --git a/content/articles/http-transactions.md b/content/articles/http-transactions.md
new file mode 100644
index 000000000..7f1eb4405
--- /dev/null
+++ b/content/articles/http-transactions.md
@@ -0,0 +1,363 @@
+---
+title: Using Atomic Transactions to Power an Idempotent API
+published_at: 2017-09-06T16:00:14Z
+location: San Francisco
+hook: Part one of a series on getting easy data correctness
+ by building APIs on the transactional machinery of
+ Postgres.
+---
+
+The software industry as a whole contains a lot of people
+doing a lot of different things, but for every developer
+working on new embedded firmware, there are about ten
+building the linchpin of modern software -- CRUD apps that
+serve requests over HTTP. A lot of these apps are built
+with MVC frameworks like Ruby on Rails or ASP.NET, and
+backed by ACID-compliant relational databases like Postgres
+or SQL Server.
+
+Sharp edges in production can lead to all kinds of
+unexpected cases during the execution of an HTTP request --
+client disconnects, application bugs that fail a request
+midway through, and timeouts are all extraordinary
+conditions that will occur regularly given enough request
+volume. Databases can protect applications against
+integrity problems with their transactions, and it's worth
+taking a little time to think about how to make best use of
+them.
+
+There's a surprising symmetry between an HTTP request and a
+database's transaction. Just like the transaction, an HTTP
+request is a transactional unit of work -- it's got a
+clear beginning, end, and result. The client generally
+expects a request to execute atomically and will behave as
+if it will (although that of course varies based on
+implementation). Here we'll look at an example service to
+see how HTTP requests and transactions apply nicely to one
+another.
+
+## The 1:1 Model (#one-to-one)
+
+I'm going to make the case that for a common idempotent
+HTTP request, requests should map to backend transactions
+at 1:1. For every request, all operations are committed or
+aborted as part of a single transaction within it.
+
+!fig src="/assets/http-transactions/http-transactions.svg" caption="Transactions (tx1, tx2, tx3) mapped to HTTP requests at a 1:1 ratio."
+
+At first glance requiring idempotency may sound like a
+sizeable caveat, but in many APIs operations can be made to
+be idempotent by massaging endpoint verbs and behavior, and
+moving non-idempotent operations like network calls to
+background jobs.
+
+Some APIs can't be made idempotent and those will need a
+little extra consideration. We'll look at what to do about
+them in more detail later as a follow up to this article.
+
+## A simple user creation service (#create-user)
+
+Let's build a simple test service with a single "create
+user" endpoint. A client hits it with an `email` parameter,
+and the endpoint responds with status `201 Created` to
+signal that the user's been created. The endpoint is also
+idempotent so that if a client hits the endpoint again with
+the same parameter, it responds with status `200 OK` to
+signal that everything is still fine.
+
+```
+PUT /users?email=jane@example.com
+```
+
+On the backend, we're going to do three things:
+
+1. Check if the user already exists, and if so, break and
+ do nothing.
+2. Insert a new record for the user.
+3. Insert a new "user action" record. It'll serve as an
+ audit log which comes with a reference to a user's ID,
+ an action name, and a timestamp.
+
+We'll build our implementation with Postgres, Ruby, and an
+ORM in the style of ActiveRecord or Sequel, but these
+concepts apply beyond any specific technology.
+
+### Database schema (#database-schema)
+
+The service defines a simple Postgres schema containing
+tables for its users and user actions [1]:
+
+``` sql
+CREATE TABLE users (
+ id BIGSERIAL PRIMARY KEY,
+ email TEXT NOT NULL CHECK (char_length(email) <= 255)
+);
+
+-- our "user action" audit log
+CREATE TABLE user_actions (
+ id BIGSERIAL PRIMARY KEY,
+ user_id BIGINT NOT NULL REFERENCES users (id),
+ action TEXT NOT NULL CHECK (char_length(action) < 100),
+ occurred_at TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+```
+
+### Backend implementation (#implementation)
+
+The server route checks to see if the user exists. If so,
+it returns immediately. If not, it creates the user and
+user action, and returns. In both cases, the transaction
+commits successfully.
+
+``` ruby
+put "/users/:email" do |email|
+ DB.transaction(isolation: :serializable) do
+ user = User.find(email)
+ halt(200, 'User exists') unless user.nil?
+
+ # create the user
+ user = User.create(email: email)
+
+ # create the user action
+ UserAction.create(user_id: user.id, action: 'created')
+
+ # pass back a successful response
+ [201, 'User created']
+ end
+end
+```
+
+The SQL that's generated in the case of a successful
+insertion looks roughly like:
+
+``` sql
+START TRANSACTION
+ ISOLATION LEVEL SERIALIZABLE;
+
+SELECT * FROM users
+ WHERE email = 'jane@example.com';
+
+INSERT INTO users (email)
+ VALUES ('jane@example.com');
+
+INSERT INTO user_actions (user_id, action)
+ VALUES (1, 'created');
+
+COMMIT;
+```
+
+## Concurrency protection (#concurrency-protection)
+
+Readers with sharp eyes may have noticed a potential
+problem: our `users` table doesn't have a `UNIQUE`
+constraint on its `email` column. The lack of one could
+potentially allow two interleaved transactions to run their
+`SELECT` phases concurrently and both get empty results.
+They'd both follow up with an `INSERT`, leaving a
+duplicated row.
+
+!fig src="/assets/http-transactions/concurrent-race.svg" caption="A data race causing two concurrent HTTP requests to insert the same row."
+
+Luckily, in this example we've used an even more powerful
+mechanism than `UNIQUE` to protect our data's correctness.
+Invoking our transaction with `DB.transaction(isolation:
+:serializable)` starts it in `SERIALIZABLE`; an isolation
+level so powerful that its guarantees might seem
+practically magical. It emulates serial transaction
+execution as if each outstanding transaction had been
+executed one after the other, rather than concurrently. In
+cases like the above where a race condition would have
+caused one transaction to taint the results of another, one
+of the two will fail to commit with a message like this
+one:
+
+```
+ERROR: could not serialize access due to read/write dependencies among transactions
+DETAIL: Reason code: Canceled on identification as a pivot, during commit attempt.
+HINT: The transaction might succeed if retried.
+```
+
+We're not going to look into how `SERIALIZABLE` works, but
+suffice it to say that it may detect a number of different
+data races for us, and if it does it'll abort a transaction
+when it tries to commit.
+
+### Retrying an abort (#abort-retry)
+
+Even though in our example a race should be rare, we'd
+prefer to handle it correctly in our application code so
+that it doesn't bubble up as a 500 to a client. This is
+possible by wrapping the request's core operations in a
+loop:
+
+``` ruby
+MAX_ATTEMPTS = 2
+
+put "/users/:email" do |email|
+ MAX_ATTEMPTS.times do
+ begin
+ DB.transaction(isolation: :serializable) do
+ ...
+ end
+
+ # Success! Leave the loop.
+ break
+
+ rescue Sequel::SerializationFailure
+ log.error "Failed to commit serially: #{$!}"
+ # Failure: fall through to the next loop.
+ end
+ end
+end
+```
+
+In this case, we might have more than one of the same
+transaction mapped to the HTTP request like so:
+
+!fig src="/assets/http-transactions/transaction-retry.svg" caption="An aborted transaction being retried within the same request."
+
+These loops will be more expensive than usual, but again,
+we're protecting ourselves against an unusual race. In
+practice, unless callers are particularly contentious,
+they'll rarely occur.
+
+Gems like [Sequel][sequel] can handle this for you
+automatically (this code will behave similarly to the loop
+above):
+
+``` ruby
+DB.transaction(isolation: :serializable,
+ retry_on: [Sequel::SerializationFailure]) do
+ ...
+end
+```
+
+### Data protection in layers (#layers)
+
+I've taken the opportunity to demonstrate the power of a
+serializable transaction, but in real life you'd want to
+put in a `UNIQUE` constraint on `email` even if you
+intended to use the serializable isolation level. Although
+`SERIALIZABLE` will protect you from a duplicate insert, an
+added `UNIQUE` will act as one more check to protect your
+application against incorrectly invoked transactions or
+buggy code. It's worth having it in there.
+
+## Background jobs (#background-jobs)
+
+It's a common pattern to add jobs to a background queue
+during an HTTP request so that they can be worked
+out-of-band and a waiting client doesn't have to block on
+an expensive operation.
+
+Let's add one more step to our user service above. In
+addition to creating user and user action records, we'll
+also make an API request to an external support service to
+tell it that a new account's been created. We'll do that by
+queuing a background job because there's no reason that it
+has to happen in-band with the request.
+
+``` ruby
+put "/users/:email" do |email|
+ DB.transaction(isolation: :serializable) do
+ ...
+
+ # enqueue a job to tell an external support service
+ # that a new user's been created
+ enqueue(:create_user_in_support_service, email: email)
+
+ ...
+ end
+end
+```
+
+If we used a common job queue like Sidekiq to do this work,
+then in the case of a transaction rollback (like we talked
+about above where two transactions conflict), we could end
+up with an invalid job in the queue. It's referencing data
+that no longer exists, so no matter how many times job
+workers retry it, it can never succeed.
+
+### Transaction-staged jobs (#staged-jobs)
+
+A way around this is to create a job staging table in our
+database. Instead of sending jobs to the queue directly,
+they're sent to a staging table first, and an ***enqueuer***
+pulls them out in batches and puts them onto the job queue.
+
+``` sql
+CREATE TABLE staged_jobs (
+ id BIGSERIAL PRIMARY KEY,
+ job_name TEXT NOT NULL,
+ job_args JSONB NOT NULL
+);
+```
+
+The enqueuer selects jobs, enqueues them, and then removes
+them from the staging table [2]. Here's a rough
+implementation:
+
+``` ruby
+loop do
+ DB.transaction do
+ # pull jobs in large batches
+ job_batch = StagedJobs.order('id').limit(1000)
+
+ if job_batch.count > 0
+ # insert each one into the real job queue
+ job_batch.each do |job|
+ Sidekiq.enqueue(job.job_name, *job.job_args)
+ end
+
+ # and in the same transaction remove these records
+ StagedJobs.where('id <= ?', job_batch.last).delete
+ end
+ end
+end
+```
+
+Because jobs are inserted into the staging table from
+within a transaction, its _isolation_ property (ACID's "I")
+guarantees that they're not visible to any other
+transaction until after the inserting transaction commits.
+A staged job that's rolled back is never seen by the
+enqueuer, and doesn't make it to the job queue.
+
+I call this pattern a [_transactionally-staged job
+drain_](/job-drain).
+
+It's also possible to just put the job queue directly in
+the database itself with a library like [Que], but [because
+bloat can be potentially dangerous in systems like
+Postgres][queues], this probably isn't as good of an idea.
+
+## Non-idempotent requests (#non-idempotent-requests)
+
+What we've covered here works nicely for HTTP requests that
+are idempotent. That's probably a healthy majority given a
+well-designed API, but there are always going to be some
+endpoints that are not idempotent. Examples include calling
+out to an external payment gateway with a credit card,
+requesting a server to be provisioned, or anything else
+that needs to make a synchronous network request.
+
+For these types of requests we're going to need to build
+something a little more sophisticated, but just like in
+this simpler case, our database has us covered. In part two
+of this series we'll look at how to implement [idempotency
+keys][idempotency] on top of multi-stage transactions.
+
+[1] Note that for the purposes of this simple example we
+could probably make this SQL more succinct, but for good
+hygiene, we use length check, `NOT NULL`, and foreign key
+constraints on our fields even if it's a little more noisy
+visually.
+
+[2] Recall that like many job queues, the "enqueuer" system
+shown guarantees "at least once" rather than "exactly once"
+semantics, so the jobs themselves must be idempotent.
+
+[idempotency]: https://stripe.com/blog/idempotency
+[que]: https://github.com/chanks/que
+[queues]: /postgres-queues
+[sequel]: https://github.com/jeremyevans/sequel
diff --git a/content/articles/idempotency-keys.md b/content/articles/idempotency-keys.md
new file mode 100644
index 000000000..8665186be
--- /dev/null
+++ b/content/articles/idempotency-keys.md
@@ -0,0 +1,1010 @@
+---
+title: Implementing Stripe-like Idempotency Keys in Postgres
+published_at: 2017-10-27T13:52:12Z
+location: San Francisco
+hook: Building resilient services by identifying foreign
+ state mutations and grouping local changes into
+ restartable atomic phases so that every request can be
+ driven to completion.
+hn_link: https://news.ycombinator.com/item?id=15569478
+---
+
+In APIs ***idempotency*** is a powerful concept. An
+idempotent endpoint is one that can be called any number of
+times while guaranteeing that the side effects will occur
+only once. In a messy world where clients and servers may
+occasionally crash or have their connections drop partway
+through a request, it's a huge help in making systems more
+robust to failure. Clients that are uncertain whether a
+request succeeded or failed can simply keep retrying it
+until they get a definitive response.
+
+As we're about to see in this article, implementing a
+server so that all requests to it are perfectly idempotent
+isn't always easy. For endpoints that get away with only
+mutating local state in an ACID database, it's possible to
+get a robust and simple idempotency implementation by
+mapping requests to transactions which I wrote about [in
+more detail a few weeks ago](/http-transactions). This
+approach is far easier and less complicated than what's
+described here, and I'd suggest that anyone who can get
+away with it take that path.
+
+## Idempotency with keys (#keys)
+
+Implementations that need to make synchronous changes in
+foreign state (i.e. outside of a local ACID store) are
+somewhat more difficult to design. A basic example of this
+is if an app needs to make a Stripe request to create a charge and
+needs to know in-band whether it went through so that it
+can decide whether to proffer some good or service. To
+guarantee idempotency on this type of endpoint we'll need
+to introduce ***idempotency keys***.
+
+An idempotency key is a unique value that's generated by a
+client and sent to an API along with a request. The server
+stores the key to use for bookkeeping the status of that
+request on its end. If a request should fail partway
+through, the client retries with _the same_ idempotency key
+value, and the server uses it to look up the request's
+state and continue from where it left off. The name
+"idempotency key" [comes from Stripe's
+API][stripeidempotency].
+
+A common way to transmit an idempotency key is through an
+HTTP header:
+
+``` sh
+POST /v1/charges
+
+...
+Idempotency-Key: 0ccb7813-e63d-4377-93c5-476cb93038f3
+...
+
+amount=1000&currency=usd
+```
+
+Once the server knows that a request has definitively
+finished by either succeeding or failing in a way that's
+not recoverable, it stores the request's results and
+associates them with the idempotency key. If a client makes
+another request with the same key, the server simply short
+circuits and returns the stored results.
+
+Keys are not meant to be used as a permanent request
+archive but rather as a mechanism for ensuring near-term
+correctness. Servers should recycle them out of the system
+beyond a horizon where they won't be of much use -- say 24
+hours or so.
+
+## Rocket Rides (#rocket-rides)
+
+Let's look at how to design idempotency keys for an API by
+building a reference implementation.
+
+Our great dev relations team at Stripe has built an app
+called [Rocket Rides][rocketrides] to demonstrate the use
+of the Connect platform and other interesting parts of the
+API. In Rocket Rides, users who are in a hurry share a ride
+with a jetpack-certified pilot to get where they're going
+_fast_. SOMA's gridlock traffic disappears into the
+distance as they soar free through virgin skies. Travel can
+be a little more risky than Lyft, so make sure to pack an
+extra parachute.
+
+
+
+The [Rocket Rides repository][rocketrides] comes with a
+simple server implementation, but software tends to grow
+with time, so to be more representative of what a real
+service with 15 engineers and half a dozen product owners
+would look like, we're going to complicate things with a
+few embellishments.
+
+### The request lifecycle (#lifecycle)
+
+When a new ride comes in we'll perform this set of
+operations:
+
+1. Insert an idempotency key record.
+2. Create a ride record to track the ride that's about to
+ happen.
+3. Create an audit record referencing the ride.
+4. **Make an API call to Stripe to charge the user for the
+ ride** (here we're leaving our own stack, and this
+ presents some risk).
+5. Update the ride record with the created charge ID.
+6. Send the user a receipt via email.
+7. Update idempotency key with results.
+
+!fig src="/assets/idempotency-keys/api-request.svg" caption="A typical API request to our embellished Rocket Rides backend."
+
+Our backend implementation will be called from the Rocket
+Rides mobile app with an idempotency key. If a request
+fails, the app will continue retrying the operation with
+the same key, and our job as backend implementers is to
+make sure that's safe. We'll be charging users' credit
+cards as part of the request, and we absolutely can't take
+the risk of charging them twice.
+
+### The entropy of production (#failure)
+
+Most of the time we can expect every one of our Rocket Rides
+API calls to go swimmingly, and every operation will succeed
+without a problem. However, when we reach the scale of
+thousands of API calls a day, we'll start to notice a few
+problems appearing here and there; requests failing due to
+poor cellular connectivity, API calls to Stripe failing
+occasionally, or bad turbulence caused by moving at
+supersonic speeds periodically knocking users offline.
+After we reach the scale of millions of API calls a day,
+basic probability will dictate that we'll be seeing these
+sorts of things happening all the time.
+
+Let's look at a few examples of things that can go wrong:
+
+* Inserting the idempotency key or ride record could fail
+ due to a constraint violation or a database connectivity
+ problem.
+* Our call to Stripe could time out, leaving it unclear
+  whether our charge went through or not.
+* Contacting Mailgun to send the receipt could fail,
+ leaving the user with a credit card charge but no formal
+ notification of the transaction.
+* The client could disconnect as they're transmitting a
+ request to the server, cancelling the operation midway
+ through.
+
+Now that we have a premise in place, let's introduce some
+ideas that will let us elegantly solve this problem.
+
+## Foreign state mutations (#foreign-state)
+
+To shore up our backend, it's key to identify where we're
+making ***foreign state mutations***; that is, calling out
+and manipulating data on another system. This might be
+creating a charge on Stripe, adding a DNS record, or
+sending an email.
+
+Some foreign state mutations are idempotent by nature (e.g.
+adding a DNS record), some are not idempotent but can be
+made idempotent with the help of an idempotency key (e.g.
+charge on Stripe, sending an email), and some operations
+are not idempotent, most often because a foreign service
+hasn't designed them that way and doesn't provide a
+mechanism like an idempotency key.
+
+The reason that the local vs. foreign distinction matters
+is that unlike a local set of operations where we can
+leverage an ACID store to roll back a result that we didn't
+like, once we make our first foreign state mutation, we're
+committed one way or another [1]. **We've pushed data into a
+system beyond our own boundaries and we shouldn't lose
+track of it.**
+
+### Between any two systems (#two-systems)
+
+We're using an API call to Stripe as a common example, but
+remember that even foreign calls within your own
+infrastructure count! It's tempting to treat emitting
+records to Kafka as part of atomic operations because they
+have such a high success rate that they feel like they are.
+They're not, and should be treated like any other fallible
+foreign state mutation.
+
+## Atomic phases (#atomic-phases)
+
+An ***atomic phase*** is a set of local state mutations
+that occur in transactions _between_ foreign state
+mutations. We say that they're atomic because we can use an
+ACID-compliant database like Postgres to guarantee that
+either all of them will occur, or none will.
+
+Atomic phases should be safely committed _before_
+initiating any foreign state mutation. If the call fails,
+our local state will still have a record of it happening
+that we can use to retry the operation.
+
+### Recovery points (#recovery-points)
+
+A ***recovery point*** is the name of a checkpoint that we
+get to after having successfully executed any atomic phase
+_or_ foreign state mutation. Its purpose is to allow a
+request that's being retried to jump back to the point in
+the lifecycle just before the last attempt failed.
+
+For convenience, we're going to store the name of the
+recovery point reached right onto the idempotency key
+relation that we'll build. All requests will initially get
+a recovery point of `started`, and after any request is
+complete (again, through either a success or definitive
+error) it'll be assigned a recovery point of `finished`.
+When in an atomic phase, the transition to a new recovery
+point should be committed as part of that phase's
+transaction.
+
+### Background jobs and staging (#background-jobs)
+
+In-band foreign state mutations make a request slower and
+more difficult to reason about, so they should be avoided
+when possible. In many cases it's possible to defer this
+type of work to after the request is complete by sending it
+to a background job queue.
+
+In our Rocket Rides example the charge to Stripe probably
+_can't_ be deferred -- we want to know whether it succeeded
+right away so that we can deny the request if it didn't.
+Sending an email _can_ and should be sent to the
+background.
+
+By using a [_transactionally-staged job
+drain_](/job-drain), we can hide jobs from workers until
+we've confirmed that they're ready to be worked by
+isolating them in a transaction. This also means that the
+background work becomes part of an atomic phase and greatly
+simplifies its operational properties. Work should always
+be offloaded to background queues wherever possible.
+
+## Hardening Rocket Rides for interstellar travel (#interstellar)
+
+Now that we've covered a few key concepts, we're ready to
+shore up Rocket Rides so that it's resilient against any
+kind of failure imaginable. Let's put together the basic
+schema, break the lifecycle up into atomic phases, and
+assemble a simple implementation that will recover from
+failures.
+
+A working version (with testing) of all of this is
+available in the [_Atomic Rocket Rides_][atomicrides]
+repository. It might be easier to download that code and
+follow along.
+
+``` sh
+git clone https://github.com/brandur/rocket-rides-atomic.git
+```
+
+### The idempotency key relation (#idempotency-key)
+
+Let's design a Postgres schema for idempotency keys in our
+app:
+
+``` sql
+CREATE TABLE idempotency_keys (
+ id BIGSERIAL PRIMARY KEY,
+ created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+ idempotency_key TEXT NOT NULL
+ CHECK (char_length(idempotency_key) <= 100),
+ last_run_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+ locked_at TIMESTAMPTZ DEFAULT now(),
+
+ -- parameters of the incoming request
+ request_method TEXT NOT NULL
+ CHECK (char_length(request_method) <= 10),
+ request_params JSONB NOT NULL,
+ request_path TEXT NOT NULL
+ CHECK (char_length(request_path) <= 100),
+
+ -- for finished requests, stored status code and body
+ response_code INT NULL,
+ response_body JSONB NULL,
+
+ recovery_point TEXT NOT NULL
+ CHECK (char_length(recovery_point) <= 50),
+ user_id BIGINT NOT NULL
+);
+
+CREATE UNIQUE INDEX idempotency_keys_user_id_idempotency_key
+ ON idempotency_keys (user_id, idempotency_key);
+```
+
+There are a few notable fields here:
+
+* `idempotency_key`: This is the user-specified idempotency
+ key. It's good practice to send something with good
+ randomness like a UUID, but not necessarily required. We
+ constrain the field's length so that nobody sends us
+ anything too exotic.
+
+ We've made `idempotency_key` unique, but across
+ `(user_id, idempotency_key)` so that it's possible to
+ have the same idempotency key for different requests as
+ long as it's across different user accounts.
+
+* `locked_at`: A field that indicates whether this
+ idempotency key is actively being worked. The first API
+ request that creates the key will lock it automatically,
+ but subsequent retries will also set it to make sure that
+ they're the only request doing the work.
+
+* `request_params`: The input parameters of the request.
+  This is stored mostly so that we can error if the user
+  sends two requests with the same idempotency key but with
+  different parameters, but can also be used for our own
+  backend to push unfinished requests to completion (see
+  [the completionist](#completionist) below).
+
+* `recovery_point`: A text label for the last phase
+ completed for the idempotent request (see [recovery
+ points](#recovery-points) above). Gets an initial value
+ of `started` and is set to `finished` when the request is
+ considered to be complete.
+
+### Other schema (#other-schema)
+
+Recall our target API lifecycle for Rocket Rides from
+above.
+
+!fig src="/assets/idempotency-keys/api-request.svg" caption="A typical API request to our embellished Rocket Rides backend."
+
+Let's bring up Postgres relations for everything else we'll
+need to build this app including audit records, rides, and
+users. Given that we aim to maximize reliability, we'll try
+to follow database best practices and use `NOT NULL`,
+unique, and foreign key constraints wherever we can.
+
+``` sql
+--
+-- A relation to hold records for every user of our app.
+--
+CREATE TABLE users (
+ id BIGSERIAL PRIMARY KEY,
+ email TEXT NOT NULL UNIQUE
+ CHECK (char_length(email) <= 255),
+
+ -- Stripe customer record with an active credit card
+ stripe_customer_id TEXT NOT NULL UNIQUE
+ CHECK (char_length(stripe_customer_id) <= 50)
+);
+
+--
+-- Now that we have a users table, add a foreign key
+-- constraint to idempotency_keys which we created above.
+--
+ALTER TABLE idempotency_keys
+ ADD CONSTRAINT idempotency_keys_user_id_fkey
+ FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE RESTRICT;
+
+--
+-- A relation that holds audit records that can help us piece
+-- together exactly what happened in a request if necessary
+-- after the fact. It can also, for example, be used to
+-- drive internal security programs tasked with looking for
+-- suspicious activity.
+--
+CREATE TABLE audit_records (
+ id BIGSERIAL PRIMARY KEY,
+
+ -- action taken, for example "created"
+ action TEXT NOT NULL
+ CHECK (char_length(action) <= 50),
+
+ created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+ data JSONB NOT NULL,
+ origin_ip CIDR NOT NULL,
+
+ -- resource ID and type, for example "ride" ID 123
+ resource_id BIGINT NOT NULL,
+ resource_type TEXT NOT NULL
+ CHECK (char_length(resource_type) <= 50),
+
+ user_id BIGINT NOT NULL
+ REFERENCES users ON DELETE RESTRICT
+);
+
+--
+-- A relation representing a single ride by a user.
+-- Notably, it holds the ID of a successful charge to
+-- Stripe after we have one.
+--
+CREATE TABLE rides (
+ id BIGSERIAL PRIMARY KEY,
+ created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
+
+ -- Store a reference to the idempotency key so that we can recover an
+ -- already-created ride. Note that idempotency keys are not stored
+ -- permanently, so make sure to SET NULL when a referenced key is being
+ -- reaped.
+ idempotency_key_id BIGINT UNIQUE
+ REFERENCES idempotency_keys ON DELETE SET NULL,
+
+ -- origin and destination latitudes and longitudes
+ origin_lat NUMERIC(13, 10) NOT NULL,
+ origin_lon NUMERIC(13, 10) NOT NULL,
+ target_lat NUMERIC(13, 10) NOT NULL,
+ target_lon NUMERIC(13, 10) NOT NULL,
+
+ -- ID of Stripe charge like ch_123; NULL until we have one
+ stripe_charge_id TEXT UNIQUE
+ CHECK (char_length(stripe_charge_id) <= 50),
+
+ user_id BIGINT NOT NULL
+ REFERENCES users ON DELETE RESTRICT
+);
+
+CREATE INDEX rides_idempotency_key_id
+ ON rides (idempotency_key_id)
+ WHERE idempotency_key_id IS NOT NULL;
+
+--
+-- A relation that holds our transactionally-staged jobs
+-- (see "Background jobs and job staging" above).
+--
+CREATE TABLE staged_jobs (
+ id BIGSERIAL PRIMARY KEY,
+ job_name TEXT NOT NULL,
+ job_args JSONB NOT NULL
+);
+```
+
+### Designing atomic phases (#rocket-rides-phases)
+
+Now that we've got a feel for what our data should look
+like, let's break the API request into distinct atomic
+phases. These are the basic rules for identifying them:
+
+1. Upserting the idempotency key gets its own atomic phase.
+2. Every foreign state mutation gets its own atomic phase.
+3. After those phases have been identified, ***all other
+ operations between*** them are grouped into atomic
+ phases. Even if there are 100 operations against an ACID
+ database between two foreign state mutations, they can
+ all safely belong to the same phase.
+
+So in our example, we have an atomic phase for inserting
+the idempotency key (`tx1`) and another for making our
+charge call to Stripe (`tx3`) and storing the result. Every
+other operation around `tx1` and `tx3` gets grouped together and
+becomes part of two more phases, `tx2` and `tx4`. `tx2`
+through `tx4` can each be reached by a recovery point
+that's set by the transaction that committed before it
+(`started`, `ride_created`, and `charge_created`).
+
+!fig src="/assets/idempotency-keys/atomic-phases.svg" caption="API request to Rocket Rides broken into foreign state mutations and atomic phases."
+
+### Atomic phase implementation (#atomic-phase-implementation)
+
+Our implementation for an atomic phase will wrap everything
+in a transaction block (note we're using Ruby, but this
+same concept is possible in any language) and give each
+phase three options for what it can return:
+
+1. A `RecoveryPoint` which sets a new recovery point. This
+ happens within the same transaction as the rest of the
+ phase so it's all guaranteed to be atomic. Execution
+ continues normally into the next phase.
+2. A `Response` which sets the idempotent request's
+ recovery point to `finished` and returns a response to
+ the user. This should be used as part of the normal
+ success condition, but can also be used to return early
+ with a non-recoverable error. Say for example that a
+ user's credit card is not valid -- no matter how many
+ times the request is retried, it will never go through.
+3. A `NoOp` which indicates that program flow should
+ continue, but that neither a recovery point nor response
+ should be set.
+
+Don't worry about parsing the specific code too much, but
+here's what it might look like:
+
+``` ruby
+def atomic_phase(key, &block)
+ error = false
+ begin
+ DB.transaction(isolation: :serializable) do
+ ret = block.call
+
+ if ret.is_a?(NoOp) || ret.is_a?(RecoveryPoint) || ret.is_a?(Response)
+ ret.call(key)
+ else
+ raise "Blocks to #atomic_phase should return one of " \
+ "NoOp, RecoveryPoint, or Response"
+ end
+ end
+ rescue Sequel::SerializationFailure
+ # you could possibly retry this error instead
+ error = true
+ halt 409, JSON.generate(wrap_error(Messages.error_retry))
+ rescue
+ error = true
+ halt 500, JSON.generate(wrap_error(Messages.error_internal))
+ ensure
+ # If we're leaving under an error condition, try to unlock the idempotency
+ # key right away so that another request can try again.
+ if error && !key.nil?
+ begin
+ key.update(locked_at: nil)
+ rescue StandardError
+ # We're already inside an error condition, so swallow any additional
+ # errors from here and just send them to logs.
+ puts "Failed to unlock key #{key.id}."
+ end
+ end
+ end
+end
+
+# Represents an action to perform a no-op. One possible option for a return
+# from an #atomic_phase block.
+class NoOp
+ def call(_key)
+ # no-op
+ end
+end
+
+# Represents an action to set a new recovery point. One possible option for a
+# return from an #atomic_phase block.
+class RecoveryPoint
+ attr_accessor :name
+
+ def initialize(name)
+ self.name = name
+ end
+
+ def call(key)
+ raise ArgumentError, "key must be provided" if key.nil?
+ key.update(recovery_point: name)
+ end
+end
+
+# Represents an action to set a new API response (which will be stored onto an
+# idempotency key). One possible option for a return from an #atomic_phase
+# block.
+class Response
+ attr_accessor :data
+ attr_accessor :status
+
+ def initialize(status, data)
+ self.status = status
+ self.data = data
+ end
+
+ def call(key)
+ raise ArgumentError, "key must be provided" if key.nil?
+ key.update(
+ locked_at: nil,
+ recovery_point: RECOVERY_POINT_FINISHED,
+ response_code: status,
+ response_body: data
+ )
+ end
+end
+
+```
+
+In the case of a serialization error, we return a `409
+Conflict` because that almost certainly means that a
+concurrent request conflicted with what we were trying to
+do. In a real app, you probably want to just retry the
+operation right away because there's a good chance it will
+succeed this time.
+
+For other errors we return a `500 Internal Server Error`.
+For either type of error, we try to unlock the idempotency
+key before finishing so that another request has a chance
+to retry with it.
+
+### Idempotency key upsert (#upserting-key-upsert)
+
+When a new idempotency key value comes into the API, we're
+going to create or update a corresponding row that we'll
+use to track its progress.
+
+The easiest case is if we've never seen the key before. If
+so, just insert a new row with appropriate values.
+
+If we have seen the key, lock it so that no other
+requests that might be operating concurrently also try the
+operation. If the key was already locked, return a `409
+Conflict` to indicate that to the user.
+
+A key that's already set to `finished` is simply allowed to
+fall through and have its response return on the standard
+success path. We'll see that in just a moment.
+
+``` ruby
+key = nil
+
+atomic_phase(key) do
+ key = IdempotencyKey.first(user_id: user.id, idempotency_key: key_val)
+
+ if key
+ # Programs sending multiple requests with different parameters but the
+ # same idempotency key is a bug.
+ if key.request_params != params
+ halt 409, JSON.generate(wrap_error(Messages.error_params_mismatch))
+ end
+
+  # Only acquire a lock if the key is unlocked or its lock has expired
+ # because it was long enough ago.
+ if key.locked_at && key.locked_at > Time.now - IDEMPOTENCY_KEY_LOCK_TIMEOUT
+ halt 409, JSON.generate(wrap_error(Messages.error_request_in_progress))
+ end
+
+ # Lock the key and update latest run unless the request is already
+ # finished.
+ if key.recovery_point != RECOVERY_POINT_FINISHED
+ key.update(last_run_at: Time.now, locked_at: Time.now)
+ end
+ else
+ key = IdempotencyKey.create(
+ idempotency_key: key_val,
+ locked_at: Time.now,
+ recovery_point: RECOVERY_POINT_STARTED,
+ request_method: request.request_method,
+ request_params: Sequel.pg_jsonb(params),
+ request_path: request.path_info,
+ user_id: user.id,
+ )
+ end
+
+ # no response and no need to set a recovery point
+ NoOp.new
+end
+```
+
+At first glance this code might not look like it's safe
+from having two concurrent requests come in in close
+succession and try to lock the same key, but it is
+because the atomic phase is wrapped in a `SERIALIZABLE`
+transaction. If two different transactions both try to lock
+any one key, one of them will be aborted by Postgres.
+
+### A directed and acyclic state machine (#acyclic-state-machine)
+
+We're going to implement the rest of the API request as a
+simple state machine whose states are a [directed acyclic
+graph (DAG)][dag]. Unlike a normal graph, a DAG moves only
+in one direction and never cycles back on itself.
+
+Each atomic phase will be activated from a recovery point,
+which was either read from a recovered idempotency key, or
+set by the previous atomic phase. We continue to move
+through phases until reaching a `finished` state, upon
+which the loop is broken and a response is sent back to the
+user.
+
+An idempotency key that was already finished will enter the
+loop, break immediately, and send back whatever response
+was stored onto it.
+
+``` ruby
+loop do
+ case key.recovery_point
+ when RECOVERY_POINT_STARTED
+ atomic_phase(key) do
+ ...
+ end
+
+ when RECOVERY_POINT_RIDE_CREATED
+ atomic_phase(key) do
+ ...
+ end
+
+ when RECOVERY_POINT_CHARGE_CREATED
+ atomic_phase(key) do
+ ....
+ end
+
+ when RECOVERY_POINT_FINISHED
+ break
+
+ else
+ raise "Bug! Unhandled recovery point '#{key.recovery_point}'."
+ end
+
+ # If we got here, allow the loop to move us onto the next phase of the
+ # request. Finished requests will break the loop.
+end
+
+[key.response_code, JSON.generate(key.response_body)]
+```
+
+### Initial bookkeeping (#initial-bookkeeping)
+
+The second phase (`tx2` in the diagram above) is simple:
+create a record for the ride in our local database, insert
+an audit record, and set a new recovery point to
+`ride_created`.
+
+``` ruby
+atomic_phase(key) do
+ ride = Ride.create(
+ idempotency_key_id: key.id,
+ origin_lat: params["origin_lat"],
+ origin_lon: params["origin_lon"],
+ target_lat: params["target_lat"],
+ target_lon: params["target_lon"],
+ stripe_charge_id: nil, # no charge created yet
+ user_id: user.id,
+ )
+
+ # in the same transaction insert an audit record for what happened
+ AuditRecord.insert(
+ action: AUDIT_RIDE_CREATED,
+ data: Sequel.pg_jsonb(params),
+ origin_ip: request.ip,
+ resource_id: ride.id,
+ resource_type: "ride",
+ user_id: user.id,
+ )
+
+ RecoveryPoint.new(RECOVERY_POINT_RIDE_CREATED)
+end
+```
+
+### Calling Stripe (#calling-stripe)
+
+With basic records in place, it's time to try our foreign
+state mutation by trying to charge the customer via Stripe.
+Here we initiate a charge for $20 using a Stripe customer
+ID that was already stored on their user record. On
+success, update the ride created in the last step with the
+new Stripe charge ID and set recovery point
+`charge_created`.
+
+``` ruby
+atomic_phase(key) do
+ # retrieve a ride record if necessary (i.e. we're recovering)
+ ride = Ride.first(idempotency_key_id: key.id) if ride.nil?
+
+ # if ride is still nil by this point, we have a bug
+ raise "Bug! Should have ride for key at #{RECOVERY_POINT_RIDE_CREATED}." \
+ if ride.nil?
+
+ raise "Simulated failed with `raise_error` param." if raise_error
+
+ # Rocket Rides is still a new service, so during our prototype phase
+ # we're going to give $20 fixed-cost rides to everyone, regardless of
+ # distance. We'll implement a better algorithm later to better
+ # represent the cost in time and jetfuel on the part of our pilots.
+ begin
+ charge = Stripe::Charge.create({
+ amount: 20_00,
+ currency: "usd",
+ customer: user.stripe_customer_id,
+ description: "Charge for ride #{ride.id}",
+ }, {
+ # Pass through our own unique ID rather than the value
+ # transmitted to us so that we can guarantee uniqueness to Stripe
+ # across all Rocket Rides accounts.
+ idempotency_key: "rocket-rides-atomic-#{key.id}"
+ })
+ rescue Stripe::CardError
+ # Sets the response on the key and short circuits execution by
+ # sending execution right to 'finished'.
+ Response.new(402, wrap_error(Messages.error_payment(error: $!.message)))
+ rescue Stripe::StripeError
+ Response.new(503, wrap_error(Messages.error_payment_generic))
+ else
+ ride.update(stripe_charge_id: charge.id)
+ RecoveryPoint.new(RECOVERY_POINT_CHARGE_CREATED)
+ end
+end
+```
+
+The call to Stripe produces a few possibilities for
+unrecoverable errors (i.e. an error that no matter how many
+times is retried will never see the call succeed). If we
+run into one, set the request to `finished` and return an
+appropriate response. This might occur if the credit card
+was invalid or the transaction was otherwise declined by
+the payment gateway.
+
+### Send receipt and finish (#send-receipt-finish)
+
+Now that our charge has been persisted, the next step is to
+send a receipt to the user. Making an external mail call
+would normally require its own foreign state mutation, but
+because we're using a transactionally-staged job drain, we
+get a guarantee that the operation commits along with the
+rest of the transaction.
+
+``` ruby
+atomic_phase(key) do
+ StagedJob.insert(
+ job_name: "send_ride_receipt",
+ job_args: Sequel.pg_jsonb({
+ amount: 20_00,
+ currency: "usd",
+ user_id: user.id
+ })
+ )
+ Response.new(201, wrap_ok(Messages.ok))
+end
+```
+
+The final step is to set a response telling the user that
+everything worked as expected. We're done!
+
+## Other processes (#other-processes)
+
+Besides the web process running the API, a few others are
+needed to make everything work (see [_Atomic Rocket Rides_'
+`Procfile`][atomicridesproc] for the full list and the
+corresponding implementations in the same repository).
+
+### The enqueuer (#enqueuer)
+
+There should be an ***enqueuer*** that moves jobs from
+`staged_jobs` to the job queue after their inserting
+transaction has committed. See [this article][jobdrain] for
+details on how to build one, or [the
+implementation][enqueuer] from _Atomic Rocket Rides_.
+
+### The completer (#completer)
+
+One problem with this implementation is we're reliant on
+clients to push indeterminate requests (for example, one
+that might have appeared to be a timeout) to completion.
+Usually clients are willing to do this because they want to
+see their requests go through, but there can be cases where
+a client starts working, never quite finishes, and drops
+forever.
+
+A stretch goal is to implement a ***completer***. Its only
+job is to find requests that look like they never finished
+to satisfaction and which it looks like clients have
+dropped, and push them through to completion.
+
+It doesn't even have to have special knowledge about how
+the stack is implemented. It just needs to know how to read
+idempotency keys and have a specialized internal
+authentication path that allows it to retry anyone's
+request.
+
+See the _Atomic Rocket Rides_ repository for [a completer
+implementation][completer].
+
+### The reaper (#reaper)
+
+Idempotency keys are meant to act as a mechanism for
+guaranteeing idempotence, and not as a permanent archive of
+historical requests. After some amount of time a
+***reaper*** process should go through keys and delete
+them.
+
+I'd suggest a threshold of about 72 hours so that even if a
+bug is deployed on Friday that errors a large number of
+valid requests, an app could still keep a record of them
+throughout the weekend and onto Monday where a developer
+would have a chance to commit a fix and have the completer
+push them through to success.
+
+An ideal reaper might even notice requests that could not
+be finished successfully and try to do some cleanup on
+them. If cleanup is difficult or impossible, it should put
+them in a list somewhere so that a human can find out what
+failed.
+
+See the _Atomic Rocket Rides_ repository for [a reaper
+implementation][reaper].
+
+## Murphy in action (#murphys-law)
+
+Now that we have all the pieces in place, let's assume the
+truth of [Murphy's Law][murphyslaw] and imagine some
+scenarios that could go wrong while a client app is talking
+to the new _Atomic Rocket Rides_ backend:
+
+* *The client makes a request, but the connection breaks
+ before it reaches the backend:* The client, having used
+ an idempotency key, knows that retries are safe and so
+ retries. The next attempt succeeds.
+
+* *Two requests try to create an idempotency key at the
+ same time:* A `UNIQUE` constraint in the database
+ guarantees that only one request can succeed. One goes
+ through, and the other gets a `409 Conflict`.
+
+* *An idempotency key is created, but the database goes
+ down and it fails soon after:* The client continues to
+ retry against the API until it comes back online. Once it
+ does, the created key is recovered and the request is
+ continued.
+
+* *Stripe is down:* The atomic phase containing the Stripe
+ request fails, and the API responds with an error that
+ tells the client to retry. They continue to do so until
+ Stripe comes back online and the charge succeeds.
+
+* *A server process dies while waiting for a response from
+ Stripe:* Luckily, the call to Stripe was also made with
+ its own idempotency key. The client retries and a new
+ call to Stripe is invoked with the same key. Stripe's own
+ idempotency guarantees ensure that we haven't
+ double-charged our user.
+
+* *A bad deploy 500s all requests midway through:*
+ Developers scramble and deploy a fix for the bug. After
+ it's out, clients retry and the original requests succeed
+ along the newly bug-free path. If the fix took so long to
+ get out that clients have long since gone away, then the
+ completer process pushes them through.
+
+Our care around implementing a failsafe design has paid off
+-- the system is safe despite a wide variety of possible
+failures.
+
+## Complications (#complications)
+
+### Non-idempotent foreign state mutations (#non-idempotent)
+
+If we know that a foreign state mutation is an idempotent
+operation or it supports an idempotency key (like Stripe
+does), we know that it's safe to retry any failures that we
+see.
+
+Unfortunately, not every service will make this guarantee.
+If we try to make a non-idempotent foreign state mutation
+and we see a failure, we may have to persist this operation
+as permanently errored. In many cases we won't know whether
+it's safe to retry or not, and we'll have to take the
+conservative route and fail the operation.
+
+The exception is if we got an error back from the
+non-idempotent API, but one that tells us explicitly that
+it's okay to retry. Indeterminate errors like a connection
+reset or timeout will have to be marked as failed.
+
+This is why you should implement idempotency and/or
+idempotency keys on all your services!
+
+### Non-ACID data stores (#non-acid)
+
+It's worth mentioning that none of this is possible on a
+non-ACID store like MongoDB. Without transactional
+semantics a database can't ever guarantee that any two
+operations commit atomically -- _every_ operation against
+your database becomes equivalent to a foreign state
+mutation because the notion of an atomic phase is
+impossible.
+
+## Beyond APIs (#beyond-apis)
+
+This article focuses heavily on APIs, but note that this
+same technique is reusable for other software as well. A
+common problem in web apps is double form submission. A
+user clicking the "Submit" button twice in quick succession
+may initiate two separate HTTP calls, and in cases where
+submissions have non-idempotent side effects (e.g. charging
+the user) this is a problem.
+
+When rendering the form initially, we can add a hidden `<input type="hidden">` field to it that contains an idempotency key.
+This value will stay the same across multiple submissions,
+and the server can use it to dedup the request.
+
+## Cultivating passive safety (#passive-safety)
+
+API backends should aim to be *passively safe* -- no matter
+what kind of failures are thrown at them they'll end up in
+a stable state, and users are never left broken even in the
+most extreme cases. From there, active mechanisms can drive
+the system towards perfect cohesion. Ideally, human
+operators never have to intervene to fix things (or at
+least as infrequently as possible).
+
+[Purely idempotent transactions](/http-transactions) and
+the idempotency keys with atomic phases described here are
+two ways to move in that direction. Failures are not only
+understood to be possible, but are expected, and enough
+thought has been applied to the system's design that we
+know it'll tolerate failure cleanly no matter what happens.
+
+[1] There is one caveat that it may be possible to
+implement [two-phase commit][2pc] between a system and all
+other systems where it performs foreign state mutations.
+This would allow distributed rollbacks, but is complex and
+time-consuming enough to implement that it's rarely seen
+with any kind of ubiquity in real software environments.
+
+[2pc]: https://en.wikipedia.org/wiki/Two-phase_commit_protocol
+[atomicrides]: https://github.com/brandur/rocket-rides-atomic
+[atomicridesproc]: https://github.com/brandur/rocket-rides-atomic/blob/master/Procfile
+[completer]: https://github.com/brandur/rocket-rides-atomic/blob/master/completer.rb
+[dag]: https://en.wikipedia.org/wiki/Directed_acyclic_graph
+[enqueuer]: https://github.com/brandur/rocket-rides-atomic/blob/master/enqueuer.rb
+[jobdrain]: /job-drain
+[murphyslaw]: https://en.wikipedia.org/wiki/Murphy%27s_law
+[reaper]: https://github.com/brandur/rocket-rides-atomic/blob/master/reaper.rb
+[rocketrides]: https://github.com/stripe/stripe-connect-rocketrides
+[stripeidempotency]: https://stripe.com/blog/idempotency
diff --git a/content/articles/interfaces.md b/content/articles/interfaces.md
new file mode 100644
index 000000000..9add4cc9d
--- /dev/null
+++ b/content/articles/interfaces.md
@@ -0,0 +1,358 @@
+---
+title: Learning From Terminals to Design the Future of User Interfaces
+published_at: 2017-01-28T13:24:00Z
+location: San Francisco
+hook: How we overvalue the wrong technology and novel
+ aspects of interface design at the expense of substantial
+ gains to our productivity.
+hn_link: https://news.ycombinator.com/item?id=13733777
+---
+
+I was recently called out on Twitter for claiming that
+Electron-based Slack, with three teams configured,
+regularly takes 30+ seconds to load. They claimed that I
+was either committing gross hyperbole, or the victim of
+some localized problem. I responded by sending over a video
+of me opening Slack and loading each of my teams in
+succession. It was 45 seconds long. _My_ claim is that this
+sort of loading time isn't unusual at all. It's just that
+we're all used to it.
+
+
+
+
+
+ This is a video of me waiting for Slack
+ configured with three teams to fully load. It's 45
+ seconds long.
+
+
+Modern applications and interfaces frustrate me. In today's
+world every one of us has the awesome power of the greatest
+computers in human history in our pockets and at our desks.
+The computational capacity at our finger tips would have
+been unimaginable even to the most audacious thinkers of
+thirty years ago.
+
+These powerful devices should be propelling our workflows
+forward with us gangly humans left barely able to keep up,
+and yet, almost without exception we wait for our computers
+instead of the other way around. We've conditioned
+ourselves to think that waiting 30+ seconds for an app to
+load, or interrupting our workflow to watch a half-second
+animation a thousand times a day, are perfectly normal.
+
+## The rise of the web (#rise-of-the-web)
+
+
+
+
+
+ Yahoo circa 1995.
+
+
+Somewhere around the late 90s or early 00s we made the
+decision to jump ship from desktop apps and start writing
+the lion's share of new software for the web. This was
+largely for pragmatic reasons: the infrastructure to talk
+to a remote server became possible for the first time, good
+cross platform UI frameworks had always been elusive beasts
+[1], and desktop development frameworks were intimidating
+compared to more approachable languages like Perl and PHP.
+
+The other reason was cosmetic: HTML and CSS gave developers
+total visual control over what their interfaces looked
+like, allowing them to brand them and build experiences
+that were pixel-perfect according to their own ends. This
+seemed like a big improvement over more limiting desktop
+development, but it led us to the world we have today
+where every interface is a different size and shape, and
+the common display conventions that we used to have to aid
+with usability have become distant memories of the past.
+
+Today, web apps are still being hailed as the future. With
+the possible exception of mobile, most software companies
+are building their products for the web, and even when
+they're not, web technology is considered a reasonable
+alternative for the desktop. Vocal groups proclaim that
+Electron-based apps convey huge benefits compared to
+traditional options in productivity and flexibility, and
+are the way forward for all desktop software.
+
+I'm not on a mission to demean this technology, but as it's
+continually augmented with ever more unwieldy retrofits,
+there's a widening disparity between what we can build with
+it compared to the best-written native apps. Software on
+the web today takes too long to load, depends too heavily
+on synchronous calls to slow networks, overemphasizes
+visual gimmickry, and lacks the refinement that allows
+mastery by more experienced users to gain huge leverage for
+productivity's sake.
+
+## The worst kept secret (#worst-kept-secret)
+
+In 2007, after releasing the iPhone, Steve Jobs told
+developers that they could all write apps for the iPhone
+_today_ ... as long as they did it in HTML5. To his credit,
+he reversed his position inside a year after realizing how
+compromised the web experience was compared to native
+options.
+
+In 2012, Mark Zuckerberg ignited JavaScript proponents
+everywhere after announcing that Facebook's biggest mobile
+mistake was focusing on HTML5. Meanwhile, consumers
+everywhere celebrated as they were given a native app that
+was far faster and more responsive.
+
+Every one of us knows that when it comes to a smartphone,
+we'd use a native app over an in-browser HTML5 any day of
+the week. Yet when it comes to the desktop, we're still
+using Gmail, Reddit, Trello, and JIRA. Computers and
+networks tend to be fast enough that this software is "good
+enough". Tellingly though, we tend to avoid this software
+whenever better options are available, like with our
+terminals and text editors.
+
+## Not just technology (#not-just-technology)
+
+Web technology isn't conducive to fast and efficient UIs,
+but that's not the only problem we're facing. Somewhere
+along the way UX designers became addicted to catchy, but
+superfluous, interface effects.
+
+Think of all the animations that an average user sits
+through in a day: switching between spaces in Mac OS,
+1Password's unlock, waiting for iOS to show the SpringBoard
+after hitting the home button, entering full screen from a
+Mac OS app, or switching between tabs in mobile Safari.
+
+
+
+
+
+ 1Password's unlock animation. The stuttering
+ isn't a problem with the video on this page; it's
+ actually how the animation looks.
+
+
+
+
+
+
+ OS X Spaces, introduced in Leopard. A
+ nominally useful feature, but the mandatory animations
+ make them slow and unwieldy.
+
+
+I liked every one of them the first time. The next five
+thousand times were less impressive. And the same goes for
+all the flourishes in this class -- they look great in
+screenshots and demos, but don't advance our ability to be
+productive; in fact, they do the opposite.
+
+
+
+
+
+ Will Cmd + Tab be the next victim of
+ overzealous animation?
+
+
+I live in fear that one day Apple will realize that they've
+left a gaping hole in their UX strategy and that task
+switches from Cmd + Tab should be animated. Multiply that
+animation's length by the average number of task switches
+per day by the number of users by their cost per second,
+and you'd be able to see that millions of dollars a year in
+global productivity has evaporated overnight.
+
+Animations are a particularly egregious visual gimmick, but
+there are others: whitespace so extravagant that only a
+minute amount of content can fit on the screen, overly
+large font sizes, submenus where a row of links would do
+just as well, unlabeled icons that look neat but leave
+their users guessing as to what they do, fixed headers that
+obscure content. The list goes on.
+
+## ThemWare (#themware)
+
+
+
+
+
+ Contrary to any "modern" interfaces, a
+ terminal is fast and responsive. There are no animations
+ or other superfluous visual baggage.
+
+
+Many of us developers are building web applications for
+other people while simultaneously eschewing them ourselves
+as much as we possibly can. While our users move at glacial
+speeds through pages on the web, we're sitting in terminal
+environments that aren't just fast, but come with the
+promise of incredible advancements in productivity to
+anyone willing to spend the time to master them.
+
+Here's why I like using terminals and terminal programs:
+
+* Startup/loading time is negligible.
+
+* Time to transition between different screens is
+ instantaneous (no animations in sight).
+
+* Interface elements are limited, but uniform.
+
+* The learning curve is steep, but rewarding. They're
+ optimized for the experienced user rather than the first
+ timer. Given that successfully onboarded users may spend
+ tens of thousands of hours in the UI over the course of
+ their lifetimes, this is just good sense.
+
+* Composability: I'm far from a zealot singing the praises
+ of the Unix philosophy, but _most_ terminal apps produce
+ output that I can process in some way to get into another
+ program. It could be way better, but it's leaps and
+ bounds over what I can do on the desktop. Even copying
+ text out of a modern web app can be a tricky proposition
+ if HTML elements aren't nested optimally.
+
+
+
+
+
+ Modern UIs have next to zero composability.
+ Even copying text can be a tricky proposition.
+
+
+
+## The principles of interface design (#interface-principles)
+
+If you ask a web designer about the elements of practical
+design in interfaces today (I say _practical_ to
+disambiguate from vague design tenets like [Dieter Rams'
+ten principles of good design][dieter-rams]), they'd talk
+to you about text legibility, intuitiveness, and
+whitespace. I'd argue that we're optimizing for the wrong
+things. UIs that are pretty and friendly are nice to have,
+but the true values of a good interface should be speed
+and efficiency to make their users as productive as
+possible.
+
+Let's dig into it by looking at the aspirational interface
+concept from a great movie: _Minority Report_. [Here's a
+video][minority-report] of it in action.
+
+
+
+ A futuristic and unrealistic concept interface:
+ the computer waits on the human instead of the human
+ waiting on the computer.
+
+
+I think we can all agree that the interface of this
+prospective future is incredible and desirable, but if we
+drill into it, what's its most amazing aspect?
+
+Years ago, I might have said that it was the wafer thin
+screens. Or the incredible touch technology. But we have
+both of those things now! In fact, what we have today is
+_better_; we can display more than two colors on screen!
+Far superior to anything they seem to have in Philip K.
+Dick's dystopian future.
+
+Today, by far the most amazing aspect is that it's an
+interface that's keeping up with its user. Instead of waiting
+on the computer to think about some text completion, show
+him an animation because he's switching an app, or start up
+a program, it's keeping up with everything he tells it to do
+in real time. The computer waits on the human rather than
+the other way around. Besides terminals and a few other
+pieces of fringe technology, modern UIs don't even come
+close to a future this fantastic.
+
+A successful interface isn't one that looks good in a still
+screenshot, it's one that maximizes our productivity and
+lets us _keep moving_. Legibility and whitespace are great,
+but they're of vanishingly small importance compared to speed and
+responsiveness.
+
+## The road ahead (#the-road-ahead)
+
+Neither a terminal nor today's web apps are what the future
+should look like, but the terminal is closer.
+
+Unfortunately, terminals also _suck_. Although better than
+the alternative in many ways, they've failed to keep up
+with any advancements from the last thirty odd years.
+Here's a few places where terminals could stand to be
+inspired by web technology:
+
+* Rich media elements: images, videos, tabulated results,
+ etc. The terminal has needed an answer to these since
+ 1985, but still doesn't have one.
+
+* Fonts. Monospace is the best family of fonts for
+ programming, but is objectively terrible for reading. We
+ should be able to mix fonts within a single terminal
+ interface for optimal legibility.
+
+* Whitespace and line-height: used in moderation, these do
+ help make UI elements more distinctive and text more
+ legible.
+
+Terminals also need a lot of other things before they're
+ever going to be a plausible interface replacement for most
+people. UI elements that aren't built around ASCII bar
+characters for example.
+
+We need a reboot. We need toolkits that produce interfaces
+that are fast, consistent, bug free, and composable _by
+default_ so that good interfaces aren't just something
+produced by the best developer/designers in the world, but
+could be reasonably expected from even junior people in the
+industry.
+
+We should be honest with ourselves and call out design
+anti-patterns that promote flashiness at the expense of
+efficiency.
+
+We should stop babying our users and try to raise beginners
+and the less technical to the bar of modern day power users
+rather than produce software that's designed for the lowest
+common denominator. We need more applications like Vim,
+Emacs, and Irssi that push their users to improve and pay
+huge dividends to those who are willing to make the effort,
+and we need to train people to use them.
+
+We should build networked applications that cache content
+and make network fetches asynchronously to remote APIs so
+that humans aren't waiting for data to come back over the
+wire while they're working.
+
+There's a future out there where our software makes
+everything from filing a bug to paying off your credit card
+fast and efficient, but the path that we're on today isn't
+it.
+
+[1] Fans of Qt (and maybe others) will vehemently disagree
+ that there's never been a good cross platform UI
+ library. I'd argue that SDKs like Qt were never quite
+ accessible enough and never produced good enough
+ results to be suitable for universal adoption.
+
+[dieter-rams]: https://www.vitsoe.com/us/about/good-design
+[minority-report]: https://www.youtube.com/watch?v=PJqbivkm0Ms
diff --git a/content/articles/job-drain.md b/content/articles/job-drain.md
new file mode 100644
index 000000000..c658bab58
--- /dev/null
+++ b/content/articles/job-drain.md
@@ -0,0 +1,188 @@
+---
+title: Transactionally Staged Job Drains in Postgres
+published_at: 2017-09-20T14:58:14Z
+location: Calgary
+hook: Building a robust background worker system that
+ leverages transactional isolation to never work a job too
+ early, and transactional durability to never let one drop.
+hn_link: https://news.ycombinator.com/item?id=15294722
+---
+
+Background jobs are one of the most common patterns in web
+programming, and for good reason. Slow API calls and other
+heavy lifting is deferred to out-of-band workers so that a
+user's request is executed as quickly as possible. In web
+services, fast is a feature.
+
+But when it comes to working with background jobs in
+conjunction with ACID transactions of the likes you'd find
+in Postgres, MySQL, or SQL Server, there are a few sharp
+edges that aren't immediately obvious. To demonstrate,
+let's take a simple workflow that starts a transaction,
+executes a few DB operations, and queues a job somewhere in
+the middle:
+
+``` ruby
+DB.transaction do |t|
+ db_op1(t)
+ queue_job()
+ db_op2(t)
+end
+```
+
+It's not easy to spot, but if your queue is fast, the job
+enqueued by `queue_job()` is likely to fail. A worker
+starts running it before its enclosing transaction is
+committed, and it fails to access data that it expected to
+be available.
+
+As an easy example, imagine `db_op1()` inserts a user
+record. `queue_job()` puts a job in the queue to retrieve
+that record, and add that user's email address (along with
+a unique internal ID) to an email whitelist managed by
+another service. A background worker dequeues the job, but
+finds that the user record it's looking for is nowhere to
+be found in the database.
+
+!fig src="/assets/job-drain/job-failure.svg" caption="A job failing because the data it relies on is not yet committed."
+
+A related problem is transaction rollbacks. In these cases
+data is discarded completely, and jobs inserted into the
+queue will _never_ succeed no matter how many times they're
+retried.
+
+## For every complex problem ... (#complex-problem)
+
+Sidekiq has [a FAQ on this exact subject][sidekiq]:
+
+> _Why am I seeing a lot of "Can't find ModelName with
+> ID=12345" errors with Sidekiq?_
+>
+> Your client is creating the Model instance within a
+> transaction and pushing a job to Sidekiq. Sidekiq is
+> trying to execute your job before the transaction has
+> actually committed. Use Rails's `after_commit :on =>
+> :create` hook or move the job creation outside of the
+> transaction block.
+
+Not to pick on Sidekiq in particular (you can find similar
+answers and implementations all over the web), but this
+solution solves one problem only to introduce another.
+
+If you queue a job _after_ a transaction is committed, you
+run the risk of your program crashing after the commit, but
+before the job makes it to the queue. Data is persisted, but
+the background work doesn't get done. It's a problem that's
+less common than the one Sidekiq is addressing, but one
+that's far more nefarious; you almost certainly won't
+notice when it happens.
+
+Other common solutions are equally as bad. For example,
+another well-worn pattern is to allow the job's first few
+tries to fail, and rely on the queue's retry scheme to
+eventually push the work through at some point after the
+transaction has committed. The downside of this
+implementation is that it thrashes needlessly (lots of
+wasted work is done) and throws a lot of unnecessary
+errors.
+
+## Transactions as gates (#transactions-as-gates)
+
+We can dequeue jobs gracefully by using a
+_transactionally-staged job drain_.
+
+With this pattern, jobs aren't immediately sent to the job
+queue. Instead, they're staged in a table within the
+relational database itself, and the ACID properties of the
+running transaction keep them invisible until they're ready
+to be worked. A secondary ***enqueuer*** process reads the
+table and sends any jobs it finds to the job queue before
+removing their rows.
+
+Here's some sample DDL for what a `staged_jobs` table might
+look like:
+
+``` sql
+CREATE TABLE staged_jobs (
+ id BIGSERIAL PRIMARY KEY,
+ job_name TEXT NOT NULL,
+ job_args JSONB NOT NULL
+);
+```
+
+And here's a simple enqueuer implementation that sends
+jobs through to Sidekiq:
+
+``` ruby
+# Only one enqueuer should be running at any given time.
+acquire_lock(:enqueuer) do
+
+ loop do
+ # Need at least repeatable read isolation level so that our DELETE after
+ # enqueueing will see the same jobs as the original SELECT.
+ DB.transaction(isolation_level: :repeatable_read) do
+ jobs = StagedJob.order(:id).limit(BATCH_SIZE)
+
+ unless jobs.empty?
+ jobs.each do |job|
+ Sidekiq.enqueue(job.job_name, *job.job_args)
+ end
+
+ StagedJob.where(Sequel.lit("id <= ?", jobs.last.id)).delete
+ end
+ end
+
+ # If `staged_jobs` was empty, sleep for some time so
+ # we're not continuously hammering the database with
+ # no-ops.
+ sleep_with_exponential_backoff
+ end
+
+end
+```
+
+Transactional isolation means that the enqueuer is unable
+to see jobs that aren't yet committed (even if they've
+been inserted into `staged_jobs` by an uncommitted
+transaction), so jobs are never worked too early.
+
+!fig src="/assets/job-drain/transaction-isolation.svg" caption="Jobs are invisible to the enqueuer until their transaction is committed."
+
+It's similarly protected against rollbacks. If a job is
+inserted within a transaction that's subsequently
+discarded, the job is discarded with it.
+
+The enqueuer is also totally resistant to job loss. Jobs
+are only removed _after_ they're successfully transmitted
+to the queue, so even if the worker dies partway through,
+it will pick back up again and send along any jobs that it
+missed. _At least once_ delivery semantics are guaranteed.
+
+!fig src="/assets/job-drain/job-drain.svg" caption="Jobs being sequestered in a staging table and enqueued when they're ready to be worked."
+
+## Advantages over in-database queues (#in-database-queues)
+
+[Delayed_job][delayedjob], [que][que], and
+[queue_classic][queueclassic] use a similar transactional
+mechanic to keep jobs hidden, and take it even a step
+further by having workers dequeue jobs directly from within
+the database.
+
+This is workable at modest to medium scale, but the frantic
+pace at which workers try to lock jobs doesn't scale very
+well for a database that's experiencing considerable load.
+For Postgres in particular, [long-running
+transactions](/postgres-queues) greatly increase the amount
+of time it takes for workers to find a job that they can
+lock, and this can lead to the job queue spiraling out of
+control.
+
+The transactionally-staged job drain avoids this problem by
+selecting primed jobs in bulk and feeding them into another
+store like Redis that's better-suited for distributing jobs
+to competing workers.
+
+[delayedjob]: https://github.com/collectiveidea/delayed_job
+[que]: https://github.com/chanks/que
+[queueclassic]: https://github.com/QueueClassic/queue_classic
+[sidekiq]: https://github.com/mperham/sidekiq/wiki/FAQ#why-am-i-seeing-a-lot-of-cant-find-modelname-with-id12345-errors-with-sidekiq
diff --git a/content/articles/kinesis-by-example.md b/content/articles/kinesis-by-example.md
new file mode 100644
index 000000000..38845feca
--- /dev/null
+++ b/content/articles/kinesis-by-example.md
@@ -0,0 +1,258 @@
+---
+hook: Splitting and merging in action.
+location: San Francisco
+published_at: 2015-03-19T07:27:45Z
+title: Kinesis Shard Splitting & Merging by Example
+---
+
+The Kinesis developer guide covers shard [splitting and merging from a high-level](http://docs.aws.amazon.com/kinesis/latest/dev/kinesis-using-sdk-java-resharding.html), but I find that it's occasionally helpful to help solidify these types of advanced topics with examples. Here we'll walk through what the most basic splitting and merging operations look like on a Kinesis stream to get a better feel for the concepts.
+
+First of all, I start out with a stream called `split-merge-test` that has a single shard. It's come online and is in an `ACTIVE` state:
+
+``` sh
+$ aws kinesis describe-stream --stream-name split-merge-test
+{
+ "StreamDescription": {
+ "StreamStatus": "ACTIVE",
+ "StreamName": "split-merge-test",
+ "StreamARN": "arn:aws:kinesis:us-east-1:551639669466:stream/split-merge-test",
+ "Shards": [
+ {
+ "ShardId": "shardId-000000000000",
+ "HashKeyRange": {
+ "EndingHashKey": "340282366920938463463374607431768211455",
+ "StartingHashKey": "0"
+ },
+ "SequenceNumberRange": {
+ "StartingSequenceNumber": "49548859072970256769156879668947671610661756289899560962"
+ }
+ }
+ ]
+ }
+}
+```
+
+Note above that the shard has a `HashKeyRange` assigned to it that starts at zero and ends at `340282366920938463463374607431768211455`. When a record is sent into a Kinesis stream, a basic hash function is applied to its partition key. The result of this function maps the record to one of the stream's shards based on the hash key range that each shard handles. A stream's total capacity is increased by subdividing the hash key range on an existing shard so that it maps to more shards than it did before.
+
+## Splitting (#splitting)
+
+Splitting a shard is a manual process in that an operator must decide how to divide up its total hash space between the new shards that will be created. I've decided to split mine evenly between two new shards, so I perform some basic arithmetic on my `EndingHashKey` to find the halfway point between it and zero:
+
+``` ruby
+$ irb
+irb(main):001:0> 340282366920938463463374607431768211455 / 2
+=> 170141183460469231731687303715884105727
+```
+
+Now that we have our answer, let's proceed to perform the split:
+
+``` sh
+$ aws kinesis split-shard --stream-name split-merge-test \
+ --shard-to-split shardId-000000000000 \
+ --new-starting-hash-key 170141183460469231731687303715884105727
+```
+
+The stream goes into `UPDATING` status. The shards look as they did before because the work to change their topology is still in-progress:
+
+``` sh
+$ aws kinesis describe-stream --stream-name split-merge-test
+{
+ "StreamDescription": {
+ "StreamStatus": "UPDATING",
+ "StreamName": "split-merge-test",
+ "StreamARN": "arn:aws:kinesis:us-east-1:551639669466:stream/split-merge-test",
+ "Shards": [
+ {
+ "ShardId": "shardId-000000000000",
+ "HashKeyRange": {
+ "EndingHashKey": "340282366920938463463374607431768211455",
+ "StartingHashKey": "0"
+ },
+ "SequenceNumberRange": {
+ "StartingSequenceNumber": "49548859072970256769156879668947671610661756289899560962"
+ }
+ }
+ ]
+ }
+}
+```
+
+A few seconds later, we can see the results of our changes. There are a few key things to note below:
+
+* The hash key range of shards is **immutable**. When we split a shard, our "parent" is still available but has now entered a state called `CLOSED` (`shardId-000000000000` in this example). Its entire hash key range has been taken over by its children, `shardId-000000000001` and `shardId-000000000002`. A `CLOSED` shard is easily identifiable by the presence of an `EndingSequenceNumber`.
+* The stream is once again `ACTIVE` now that updates are finished.
+* Child shards have a pointer called `ParentShardId` back to the parent that they split from so that some history is maintained.
+* The stream's sequence number jumped quite a bit during the split, by about 10^48 in fact. This is slightly less impressive when you take into account that the sequence jumps by about 10^24 between two normal record insertions, but this is quite a bit bigger than even that.
+
+``` sh
+$ aws kinesis describe-stream --stream-name split-merge-test
+{
+ "StreamDescription": {
+ "StreamStatus": "ACTIVE",
+ "StreamName": "split-merge-test",
+ "StreamARN": "arn:aws:kinesis:us-east-1:551639669466:stream/split-merge-test",
+ "Shards": [
+ {
+ "ShardId": "shardId-000000000000",
+ "HashKeyRange": {
+ "EndingHashKey": "340282366920938463463374607431768211455",
+ "StartingHashKey": "0"
+ },
+ "SequenceNumberRange": {
+ "EndingSequenceNumber": "49548859072981407141756144980517230543978492779512725506",
+ "StartingSequenceNumber": "49548859072970256769156879668947671610661756289899560962"
+ }
+ },
+ {
+ "ShardId": "shardId-000000000001",
+ "HashKeyRange": {
+ "EndingHashKey": "170141183460469231731687303715884105726",
+ "StartingHashKey": "0"
+ },
+ "ParentShardId": "shardId-000000000000",
+ "SequenceNumberRange": {
+ "StartingSequenceNumber": "49548859213219643322715968606065803827347328807764754450"
+ }
+ },
+ {
+ "ShardId": "shardId-000000000002",
+ "HashKeyRange": {
+ "EndingHashKey": "340282366920938463463374607431768211455",
+ "StartingHashKey": "170141183460469231731687303715884105727"
+ },
+ "ParentShardId": "shardId-000000000000",
+ "SequenceNumberRange": {
+ "StartingSequenceNumber": "49548859213241944067914499229207339545619977169270734882"
+ }
+ }
+ ]
+ }
+}
+```
+
+It may also be worth pointing out that although `shardId-000000000000` is considered to be `CLOSED` now, as the last records that it contains leave Kinesis' retention window it will transition from `CLOSED` to `EXPIRED`. When it does, no further records can ever be retrieved from the shard.
+
+## Merging (#merging)
+
+Now let's see what happens when we merge the two shards back together. A merge operation takes two shards as parameters: (1) the main shard to merge, and (2) the adjacent shard that will be mixed into it. Note that the use of the word "adjacent" here is not an accident; because of the way that Kinesis shards handle hash key ranges, only two shards that handle ranges that are contiguous can be merged back together.
+
+``` sh
+$ aws kinesis merge-shards --stream-name split-merge-test \
+ --shard-to-merge shardId-000000000001 \
+ --adjacent-shard-to-merge shardId-000000000002
+```
+
+As before, our stream enters `UPDATING`, but does not yet reflect our changes:
+
+``` sh
+$ aws kinesis describe-stream --stream-name split-merge-test
+{
+ "StreamDescription": {
+ "StreamStatus": "UPDATING",
+ "StreamName": "split-merge-test",
+ "StreamARN": "arn:aws:kinesis:us-east-1:551639669466:stream/split-merge-test",
+ "Shards": [
+ {
+ "ShardId": "shardId-000000000000",
+ "HashKeyRange": {
+ "EndingHashKey": "340282366920938463463374607431768211455",
+ "StartingHashKey": "0"
+ },
+ "SequenceNumberRange": {
+ "EndingSequenceNumber": "49548859072981407141756144980517230543978492779512725506",
+ "StartingSequenceNumber": "49548859072970256769156879668947671610661756289899560962"
+ }
+ },
+ {
+ "ShardId": "shardId-000000000001",
+ "HashKeyRange": {
+ "EndingHashKey": "170141183460469231731687303715884105726",
+ "StartingHashKey": "0"
+ },
+ "ParentShardId": "shardId-000000000000",
+ "SequenceNumberRange": {
+ "StartingSequenceNumber": "49548859213219643322715968606065803827347328807764754450"
+ }
+ },
+ {
+ "ShardId": "shardId-000000000002",
+ "HashKeyRange": {
+ "EndingHashKey": "340282366920938463463374607431768211455",
+ "StartingHashKey": "170141183460469231731687303715884105727"
+ },
+ "ParentShardId": "shardId-000000000000",
+ "SequenceNumberRange": {
+ "StartingSequenceNumber": "49548859213241944067914499229207339545619977169270734882"
+ }
+ }
+ ]
+ }
+}
+```
+
+And finally the stream re-enters its `ACTIVE` state with our new merged shard. It's worth pointing out that:
+
+* Like before with our split, closed shards `shardId-000000000001` and `shardId-000000000002` are still around, but now have an `EndingSequenceNumber` to indicate that they are closed.
+* The new shard `shardId-000000000003` remembers its history. It points back to its `ParentShardId`, as well as the `AdjacentParentShardId` that also helped to derive it.
+
+``` sh
+$ aws kinesis describe-stream --stream-name split-merge-test
+{
+ "StreamDescription": {
+ "StreamStatus": "ACTIVE",
+ "StreamName": "split-merge-test",
+ "StreamARN": "arn:aws:kinesis:us-east-1:551639669466:stream/split-merge-test",
+ "Shards": [
+ {
+ "ShardId": "shardId-000000000000",
+ "HashKeyRange": {
+ "EndingHashKey": "340282366920938463463374607431768211455",
+ "StartingHashKey": "0"
+ },
+ "SequenceNumberRange": {
+ "EndingSequenceNumber": "49548859072981407141756144980517230543978492779512725506",
+ "StartingSequenceNumber": "49548859072970256769156879668947671610661756289899560962"
+ }
+ },
+ {
+ "ShardId": "shardId-000000000001",
+ "HashKeyRange": {
+ "EndingHashKey": "170141183460469231731687303715884105726",
+ "StartingHashKey": "0"
+ },
+ "ParentShardId": "shardId-000000000000",
+ "SequenceNumberRange": {
+ "EndingSequenceNumber": "49548859213230793695315233917635362760664090379986927634",
+ "StartingSequenceNumber": "49548859213219643322715968606065803827347328807764754450"
+ }
+ },
+ {
+ "ShardId": "shardId-000000000002",
+ "HashKeyRange": {
+ "EndingHashKey": "340282366920938463463374607431768211455",
+ "StartingHashKey": "170141183460469231731687303715884105727"
+ },
+ "ParentShardId": "shardId-000000000000",
+ "SequenceNumberRange": {
+ "EndingSequenceNumber": "49548859213253094440513764540776898478936738741492908066",
+ "StartingSequenceNumber": "49548859213241944067914499229207339545619977169270734882"
+ }
+ },
+ {
+ "ShardId": "shardId-000000000003",
+ "HashKeyRange": {
+ "EndingHashKey": "340282366920938463463374607431768211455",
+ "StartingHashKey": "0"
+ },
+ "ParentShardId": "shardId-000000000001",
+ "AdjacentParentShardId": "shardId-000000000002",
+ "SequenceNumberRange": {
+ "StartingSequenceNumber": "49548859483727682580892427312894066474572005964670566450"
+ }
+ }
+ ]
+ }
+}
+```
+
+Further splits and merges will all follow the same pattern, leaving behind a trail of dead shards that act as a historical record to follow the lifecycle of the stream. The reason behind this design might not be immediately obvious, but suffice it to say that the immutable property of a shard's hash range is important in helping to guarantee that records can be consumed in-order even across a merge or split. We'll leave a more detailed explanation on this topic to a future article.
diff --git a/content/articles/kinesis-in-production.md b/content/articles/kinesis-in-production.md
new file mode 100644
index 000000000..f2a644b7d
--- /dev/null
+++ b/content/articles/kinesis-in-production.md
@@ -0,0 +1,95 @@
+---
+hook: A short write-up on findings, limitations, and opinion on Kinesis after a month
+ in production.
+location: San Francisco
+published_at: 2015-05-03T17:35:37Z
+title: A Month of Kinesis in Production
+---
+
+We've been powering a production component with Kinesis for a little over a month now so it seems like as good of a time as ever to put together a few thoughts on how it's worked out. My goal here is to put together a few short objective observations on how it's performed, followed by what I perceive as the product's major limitations, and then a short opinion as to whether I'd use it again. Keep in mind though that while we're putting a bit of load on our stream, we haven't come close to pushing the product to its limits (well, except for one limit, see below), so if you're planning on pushing a cluster to the 100s of GBs or TBs scale, the findings here may not be sufficient to make an informed decision on the product.
+
+First off, a little background: [Kinesis](http://aws.amazon.com/kinesis/) is Amazon's solution for real-time data processing that's been designed from the get go for horizontal scalability, reliability, and low latency. It's a data store that can have records produced into it on one end, and consumed from it on the other. This may sound a little like a queue, but it differs from a queue in that every injected event is designed to be consumed as many times as necessary, allowing many consumers to read the stream simultaneously and in isolation from each other. To achieve this, records are persistent for a period of time (currently a 24 hour sliding window) so that any individual consumers can go offline and still come back and consume the stream from where they left off.
+
+The basic resource of Kinesis is a _stream_, which is a logical grouping of stream data around a single topic. To scale a stream, it can be subdivided into _shards_, each of which can handle 1 MB/s of data written to it and 2 MB/s of data read from it (I'd expect these numbers to change over time though, see [Kinesis limits](http://docs.aws.amazon.com/kinesis/latest/dev/service-sizes-and-limits.html)). _Records_ injected into a stream go to a shard based on a _partition key_ chosen for each record by the user which is transformed using a hash function so that it maps to a single shard consistently. Partition keys offload scalability decisions to the Kinesis users, which is a simple way of achieving a powerful level of control over a Kinesis cluster in that they can make their own decisions around what type of record ordering is important to them.
+
+The most obvious alternative to Kinesis is [Kafka](http://kafka.apache.org/), which provides a similar model built on top of open-source Apache infrastructure. The systems differ significantly in implementation, but both aim to provide speed, scalability, and durability.
+
+Kinesis has a well-written [developer guide](http://docs.aws.amazon.com/kinesis/latest/dev/introduction.html) if you want to learn a little more about it. I've also written a few other articles on the subject. See [Guaranteeing Order with Kinesis Bulk Puts](/kinesis-order) and [Kinesis Shard Splitting & Merging by Example](/kinesis-by-example).
+
+## Performance and stability (#performance-and-stability)
+
+Probably of most paramount concern is how Kinesis performs in production. One thing to keep in mind when looking at these numbers is that Kinesis' durability characteristic is highly relevant. When injecting a record to a stream, that record is synchronously replicated to three different availability zones in the region to help guarantee that you'll get it out of the other side. There is a performance cost associated with this level of reliability, and comparing the numbers below to a single-node system like Redis (for example), would be nonsense.
+
+First off, we have put latency on the side of our producer. These metrics are generated from one of six different producer nodes running within the same AWS region as the Kinesis stream. All of these use the bulk put records API, and include a variable payload roughly in the range of 1 to 10 events. The Kinesis API operates over HTTP, and our code re-uses an already open connection to perform our operations whenever possible.
+
+As seen in the chart below, P50 manages to stay right around the 35 ms mark very consistently. P95 is usually right around 100 ms and P99 closer to 200 ms, but we don't see 300 ms broken under these metrics.
+
+
+
+ P50, P95, and P99 for bulk puts (seconds). P50 hovers around 35 ms.
+
+
+_(My apologies for these charts by the way, it seems that utilitarian things like axis labels and data legends aren't considered pretty enough by Librato to merit inclusion in a chart snapshot.)_
+
+Next up: time to fetch records from a Kinesis shard. As above, these numbers are from within the same region as the Kinesis stream and are bulk operations in that the consumers will fetch as many unconsumed records as are available. Reads from a Kinesis stream seem to be a little slower than writes, and we see P50 closer to 150 ms with P95 around 1 s and P99 a little over 2 s.
+
+
+
+ P50, P95, and P99 for fetching records (seconds). P50 hovers around 150 ms.
+
+
+Lastly, let's take a look at the total time that it takes a record to traverse the stream from its moment of inception on the producer to when it's in the hands of a consumer. Note that this is not a perfect number in that it includes some time that the record spends in our own application as it waits to be dispatched by a background process to Kinesis, but I've included it anyway given that many real-world Kinesis users may have a similar mechanism in their own stacks.
+
+P50 on this total throughput time sits right around 1.5 s, with P95 and P99 sitting a little further out around 5 s.
+
+
+
+ P50, P95, and P99 of time from production to consumption. P50 hovers around 1.50 s.
+
+
+All-in-all, I consider these numbers pretty good for a distributed system. In our particular use case, accuracy is more important than ultra low-latency throughput, so given the durability guarantees that Kinesis is getting us here, I'm more than willing to accept them.
+
+A little more on the qualitative side of observation, we've still yet to notice a serious operational problem in one of our Kinesis streams throughout the time that we've had them online. This doesn't give me much data to report on how well they behave during a degraded situation like a serious outage, but also demonstrates that the infrastructure is pretty reliable.
+
+## Limitations (#limitations)
+
+Now onto the part that may be the most important for the prospective Kinesis user: the product's limitations. I'm happy to report that I didn't find many, but those that I did find are significant.
+
+### You get five (reads) (#five-reads)
+
+Scalability is right there on the Kinesis front page as one of the core features of the product, and indeed it is scalable: by default a stream in US East can have up to 50 shards (this limit can be increased by opening a support ticket with AWS), each of which can handle 1 MB/s in and 2 MB/s out for a theoretical maximum of 50 MB/s in and 100 MB/s out. That's an incredible amount of data! However, despite being very scalable along this one dimension, it scales very poorly along another: the number of consumers that a stream can have.
+
+Each shard on a stream supports a maximum limit of 5 reads per second. That number is [right on the limits page](http://docs.aws.amazon.com/kinesis/latest/dev/service-sizes-and-limits.html), but at no point does the documentation really spell out its repercussions (or at least nowhere that I could find). If each shard only supports 5 read operations, and each application consuming the stream must consume every shard, then you can only have a _maximum of 5 applications if you want to read the stream once per second_. If you want to have ten applications consuming the stream, then you will have to limit each so that on average it only consumes the stream once every two seconds. Each read can pull down 10 MB of data, so keeping up with the stream isn't a problem, but you can see how you'll have to sacrifice latency in order to scale up the number of consumers reading the stream.
+
+This might be fine for a number of Kinesis use cases, but I was hoping to be able to build an infrastructure where I could have 100 or more applications consuming the stream. To achieve this I'd have to throttle each one back to only read on average of once every 20 seconds — a number that's unacceptable even for my relatively latency insensitive uses.
+
+Given that each shard already has a hard limit on input/output, I don't completely understand why such an aggressive per-shard limit on reads is necessary, but I can speculate that whatever mechanism they're using to perform a read is incredibly expensive. I've corresponded a little bit with Kinesis staff on it, but haven't been able to get a good answer. The Rube Goldberg-esque solution of chaining streams together by piping output of one to input of another until I've achieved sufficient fanout to support all my consumers was suggested a couple times, but that's a path that I wasn't too anxious to take.
+
+To help illustrate the problem, here's a chart of the number of "throughput exceeded" errors stemming from the read limit that our consumers ran into just over the past 30 minutes. This is only three consumers hitting a low throughput stream once a second.
+
+
+
+ Number of errors encountered due to read limits on a low volume stream over 30 minutes.
+
+
+### Vanishing history (#vanishing-history)
+
+As described in my previous article [Kinesis Shard Splitting & Merging by Example](/kinesis-by-example), a Kinesis shard is immutable in the sense that if it's split or merged, the existing shard is closed and new shards are created in its place. I find this to be quite an elegant solution to the problem of consumers trying to guarantee their read order across these events. To help consumers check that they've appropriately consumed closed shards to completion, the [DescribeStream](http://docs.aws.amazon.com/kinesis/latest/APIReference/API_DescribeStream.html) API endpoint allows them to examine the ancestry of each currently open shard and the range of sequence numbers that every closed shard handled during its lifetime.
+
+This is all well and good except that the list in `DescribeStream` is pruned on a regular basis and closed shards removed. This prevents a consumer that comes back online across a split and outside of the pruning window from determining whether it can come back online in a way that it's certain that it consumed all of the stream's data. No API parameter is available to request a complete list.
+
+Removing these old records of ancestry on such an aggressive schedule to save a few hundred bytes of JSON on an infrequently made API call seems like a pretty strange design decision to me. Like with the previous problem, corresponding with staff didn't help me gain any particular insight into its justification.
+
+## Comparison to Kafka (#kafka)
+
+I don't have a great deal of experience with Kafka (we ran a cluster for a few months but didn't put together any kind of analysis of worthwhile depth), so I'll keep this section short.
+
+One aspect of Kinesis that I did appreciate is the removal of the concept of a _topic_, which is a channel of sorts within a Kafka cluster that allows messages to be grouped together logically. The topic is an important feature when considering the maintenance and operation of Kafka in that it allows a single cluster to be re-used for a number of applications (and therefore fewer moving parts to keep an eye on), but as a developer, this isn't something that I really want to think about. If I want to send one type of record I'll provision a Kinesis stream for it, and if I want to send a different type of record I'll provision a separate stream for that. From my perspective as an AWS user, those are two completely independent services that scale orthogonally and have their isolated sets of resources (rate limits, throughput capacity, and the like). As far as I'm concerned, this is a major win for hosted services.
+
+## Summary (#summary)
+
+I'm overall pretty happy with my experience with Kinesis, and we'll continue to run services on it for the foreseeable future. In general it behaves like a perfect infrastructure component in that it performs well and stays out of the way.
+
+By far the biggest rub for me is the 5 reads per second limit, which will certainly limit what we can do with the product. Admittedly, if I'd understood the consequences of this earlier, I probably would have pushed harder for Kafka, but it's not worth re-implementing what we already have over.
+
+For prospective users, I'd recommend Kinesis in most places just to help people stay out of the business of maintaining their own Kafka cluster and learning how to operate it. That said, if the read limit sounds like it may be a problem, it might be wise to investigate all possible options.
diff --git a/content/articles/kinesis-order.md b/content/articles/kinesis-order.md
new file mode 100644
index 000000000..380352bb4
--- /dev/null
+++ b/content/articles/kinesis-order.md
@@ -0,0 +1,151 @@
+---
+hook: On guaranteeing order with the bulk put API of an event stream.
+location: San Francisco
+published_at: 2015-03-05T01:13:46Z
+title: Guaranteeing Order with Kinesis Bulk Puts
+---
+
+Playing with Kinesis recently, we came across a problem of how to guarantee order when posting a set of records to its bulk API. This article summarizes it and talks about how although we never directly solved the problem that we thought we had, we were able to use a slightly altered approach to have the system meet some of the characteristics that we wanted to see.
+
+The basic primitive to send a record into the Kinesis API is [`PutRecord`](http://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecord.html) which allows a producer to put a single record into the stream. The API has an optional request parameter called `SequenceNumberForOrdering` that allows a consumer to pass in a previously-generated sequence number to guarantee that no matter what sequence number is generated for the record, it will be larger than the one you had before.
+
+You can get a better feel for this idea with this sample dialog between a record producer and the Kinesis API:
+
+```
+REQUEST 1 [Producer]
+PutRecord record1.
+
+RESPONSE 1 [Kinesis]
+OK. SequenceNumber="123".
+
+REQUEST 2 [Producer]
+PutRecord record2. SequenceNumberForOrdering="123".
+
+RESPONSE 2 [Kinesis]
+OK. SequenceNumber="124".
+```
+
+Kinesis also provides a bulk API that allows many records to be injected into the stream at once called [`PutRecords`](http://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecords.html). The bulk API has a nice characteristic in that it allows up to 1000 records, or 1 MB, per second to be written. If one of your goals is great throughput, the economy of scale that you get by wrapping all your requests up into a single HTTP envelope and sending them all through at once is considerable.
+
+Use of the bulk API introduces a problem though in that guaranteeing order becomes more difficult. Kinesis will try to order new records according to how they came in with your payload, but any given record in that request can fail, and the semantics around failures dictate that even if a failure does occur, every non-failed record in the batch will succeed normally. Responsibility falls to the producer to detect such failures and retry them on subsequent requests.
+
+For example:
+
+```
+REQUEST 1 [Producer]
+PutRecords
+ - record1
+ - record2
+ - record3
+
+RESPONSE 1 [Kinesis]
+ - OK. SequenceNumber="123".
+ - FAILED.
+ - OK. SequenceNumber="124".
+
+REQUEST 2 (retry failed) [Producer]
+PutRecords
+ - record2
+
+RESPONSE 2 [Kinesis]
+ - OK. SequenceNumber="125".
+```
+
+Here we try to post three records in order (`record1`, `record2`, `record3`), but they will end up out-of-order in the stream due to a failure (`record1` and `record3` staying ordered, but the failed `record2` being injected into the stream only on a retry).
+
+So with `PutRecord` we can guarantee order at the cost of throughput, and if we use `PutRecords` we get improved throughput but without order. So the question is: is there a way that we can get both of these desirable characteristics?
+
+## Sharding (#sharding)
+
+Before we get there, let's briefly touch upon the concept of sharding with respect to a Kinesis stream. [As described in this architectural diagram](http://docs.aws.amazon.com/kinesis/latest/dev/key-concepts.html), a Kinesis stream is split into one or more shards for purposes of scalability; each shard has an upper limit on the amount of data that can be written into it or read out of it (currently these limits are 1 MB/s and 2 MB/s respectively), so as the volume of data in a stream is increased, more shards can be added to achieve a form of horizontal scaling. Records within a shard are ordered according to how records were sent into them, and this order will be maintained when they're streamed out to a consumer. However, when producing to and consuming from multiple shards, no kind of ordering between shards can be guaranteed.
+
+Producers control which shards they're producing to by specifying a [partition key](http://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecord.html) along with any record they're sending into the stream; both the `PutRecord` and bulk `PutRecords` APIs support (and require) one. Partition keys are mapped through a hash function which will result in one of the stream's shards being selected in a deterministic fashion. As long as the total number of shards in a stream has not changed, a partition key will consistently map to the same shard no matter how many times it's reused.
+
+So back to our original question: how can we guarantee that all records are consumed in the same order in which they're produced? The answer is that we can't, but that we shouldn't let that unfortunate reality bother us too much. Once we've scaled our stream to multiple shards, there's no mechanism that we can use to guarantee that records are consumed in order across the whole stream; only within a single shard. So instead of focusing on a global guarantee of ordering, we should instead try to leverage techniques that will get us as much throughput as possible, and fall back to techniques that allow us to control for certain subsets of records where we deem it necessary.
+
+## Sequential puts per partition key; bulk otherwise (#per-partition)
+
+To achieve the above, we're using a simple algorithm on our producers:
+
+``` ruby
+while records = more_records()
+ records_to_post = records.
+ group_by { |record| record.partition_key }.
+ map { |_, partition_group| partition_group.first }
+ kinesis.put_records(records_to_post)
+end
+```
+
+Most of the time all pending records that need to be sent into the stream will be posted to Kinesis as a single bulk batch. However, if we find multiple records that have the same partition key, we only post the first one that was produced, and wait for the next cycle through to post any other events in the same partition.
+
+Let's solidify this idea a little by using an example. In our production system, we pool records in a Postgres database before streaming them out to Kinesis. That database has a schema that looks like this:
+
+```
+=> SELECT partition_key, record_data FROM kinesis_records ORDER BY id;
+
+ partition_key | record_data
+--------------------------------------+---------------------------------------
+ 8a9e7a19-9fe1-49b2-9b42-591520784449 | {"resource":"app","action":"create"}
+ d0d97986-0c90-404f-bccd-9ac6c27f9235 | {"resource":"app","action":"create"}
+ 8a9e7a19-9fe1-49b2-9b42-591520784449 | {"resource":"app","action":"update"}
+ 8a9e7a19-9fe1-49b2-9b42-591520784449 | {"resource":"app","action":"destroy"}
+ b20d88bc-ba68-41e3-87cb-3a93cc619833 | {"resource":"app","action":"update"}
+(5 rows)
+```
+
+When we want to select a batch of records to stream, we'll use an SQL query that partitions our pending records over `partition_key` and selects the first record for each partition:
+
+```
+=> SELECT partition_key, record_data FROM kinesis_records
+WHERE id IN (
+ SELECT MIN(id)
+ FROM kinesis_records
+ GROUP BY partition_key
+)
+ORDER BY id;
+
+ partition_key | record_data
+--------------------------------------+---------------------------------------
+ 8a9e7a19-9fe1-49b2-9b42-591520784449 | {"resource":"app","action":"create"}
+ d0d97986-0c90-404f-bccd-9ac6c27f9235 | {"resource":"app","action":"create"}
+ b20d88bc-ba68-41e3-87cb-3a93cc619833 | {"resource":"app","action":"update"}
+(3 rows)
+```
+
+In the data set above, those first three records would all be posted in a single batch. The query would then run again and fetch the next record in the `8a9e7a19-...` sequence:
+
+```
+ partition_key | record_data
+--------------------------------------+---------------------------------------
+ 8a9e7a19-9fe1-49b2-9b42-591520784449 | {"resource":"app","action":"update"}
+```
+
+That would be posted by itself in a second batch. The worker would then run one more time to fetch the final record for that partition and post it as a third batch:
+
+```
+ partition_key | record_data
+--------------------------------------+---------------------------------------
+ 8a9e7a19-9fe1-49b2-9b42-591520784449 | {"resource":"app","action":"destroy"}
+```
+
+By partitioning over our known key and selecting the first result ordered by the table's sequential ID, we achieve the same effect as the pseudocode algorithm above, resulting in a set of records with unique partition keys that are safe to post in bulk even if a failure ends up ordering it in a way that we didn't intend.
+
+## Partition key selection (#partition-key)
+
+A side effect of this approach is that the selection of a partition key that's logical for your records becomes one of the most important concerns when starting to stream a new type of record. The partition key is the only mechanism available for controlling the order in which records are streamed to consumers, and some consideration must be taken when selecting partition keys to ensure that all records in the stream will play nicely together.
+
+As an example, consider a GitHub-like service that would like to stream all repository-related events that occur within it through a Kinesis stream. We'd like to stream three types of events:
+
+1. Create repository.
+2. Destroy repository.
+3. Commit to repository.
+
+Repositories can be referenced through a combination of account and project name (e.g. `brandur/my-project` like you'd see after `github.com/`) and commits can be referenced by their SHA hash (e.g. `c0ab1e5c...`).
+
+Say we have a consumer on the Kinesis stream that's maintaining some basic state like a cache of all repositories and commits that are known to the service. When a new repository is created, it will set a key for it in a Redis store, and when that repository is destroyed, it will remove that key.
+
+We can see even in this basic example that if our cache consumer receives a create and destroy event for the same repository out of order, it will be left with invalid data. After receiving the destroy event first, it will remove a cache key that was never set, and then after receiving the misordered create event, it will set a cache key for a repository which is no longer valid. To ensure that these events are always received by all consumers in the correct order, we can set the partition key of the create and destroy events to the same value so that they'll end up on the same shard. In this case, the name of the repository that they're related to (`brandur/my-project`) is a good candidate for this shared key. Kinesis will consistently translate the repository name to a single shard, and all consumers listening to that shard will receive the events that are emitted from it in the expected order.
+
+The same principle applies to streamed commits as well. It might be tempting at first glance to partition commits based on their SHA identifier (`c0ab1e5c`), but if we did so, we would open up the possibility of a consumer receiving a commit for a repository that doesn't exist because it's already processed a destroy event for that repository that came out of a different shard. We can solve this problem by assigning the partition key of commit events to the identifier of their parent repository (again, `brandur/my-project` instead of `c0ab1e5c`) so that they'll end up on the same shard as the rest of the repository's events.
+
+With this system, we accept that consumers may not be consuming events in exactly the same order in which we emitted them, but we also know that order will be guaranteed when it matters. The result is much improved scalability in the way that the stream can be split into any number of shards and our consumers will continue to consume the stream correctly.
diff --git a/content/articles/logfmt.md b/content/articles/logfmt.md
new file mode 100644
index 000000000..6bfaf6757
--- /dev/null
+++ b/content/articles/logfmt.md
@@ -0,0 +1,129 @@
+---
+hook: A logging format used inside companies such as Heroku and Stripe which is optimal
+ for easy development, consistency, and good legibility for humans and computers.
+location: San Francisco
+published_at: 2013-10-28T16:28:04Z
+title: logfmt
+---
+
+If you've ever run an app on Heroku, you may have come across log messages produced by the Heroku router and wondered about their unusual formatting:
+
+ at=info method=GET path=/ host=mutelight.org fwd="124.133.52.161"
+ dyno=web.2 connect=4ms service=8ms status=200 bytes=1653
+
+This curious format is unofficially known as "logfmt", and at Heroku we've adopted it as a standard to provide some consistency across internal components. I've never been able to find any good posts providing any context or background for logfmt, so I've put together this short write-up.
+
+At its core, logfmt is just a basic way of displaying key/value pairs in such a way that its output is readable by a human or a computer, while at the same time not being absolutely optimal for either.
+
+Especially with a bit of practice and colorized output, it's pretty easy for a human being to see what's going on here which is of course a core value for any good logging format. At the same time, building a machine parser for the format is trivial so any of our internal components can ingest logs produced by any other component. [Splunk also recommends the same format under their best practices](http://dev.splunk.com/view/logging-best-practices/SP-CAAADP6) so we can be sure that it can be used to search and analyze all our logs in the long term.
+
+## Eliminate the guesswork (#eliminate-guesswork)
+
+A major advantage provided by logfmt is that it helps to eliminate any guesswork that a developer would have to make while deciding what to log. Take the following line in a more traditional logging format for example:
+
+ INFO [ConsumerFetcherManager-1382721708341] Stopping all fetchers
+ (kafka.consumer.ConsumerFetcherManager)
+
+While writing this code, a developer would've had to decide how to format the log line like placing the manager's identifier in square brackets at the beginning, the module name in parentheses at the end, with some general information in the middle. Convention can help a lot here, but it's still something that a developer has to think about. Furthermore, what if they want to add another piece of data like number of open fetchers? Does that belong on a new line, or in another set of brackets somewhere?
+
+An equivalent logfmt line might look like this:
+
+ level=info tag=stopping_fetchers id=ConsumerFetcherManager-1382721708341
+ module=kafka.consumer.ConsumerFetcherManager
+
+Readability isn't compromised too much, and all the developer has to do is dump any information that they think is important. Adding another piece of data is no different, just append `num_open_fetchers=3` to the end. The developer also knows that if for any reason they need to generate a statistic on-the-fly like the average number of fetchers still open, they'll easily be able to do that with a simple Splunk (or equivalent) query:
+
+ tag=stopping_fetchers | stats p50(num_open_fetchers) p95(num_open_fetchers)
+ p99(num_open_fetchers)
+
+## Human logfmt (#human)
+
+**Note —** Added after original publication (on March 30, 2016) to
+reflect changes to the recommended best practices.
+
+logfmt may be more readable than something like JSON, but it's still difficult
+to scan quickly for humans. To improve that, I'd recommend using the approach
+seen in [logrus][logrus] and including a human readable message with every log
+line:
+
+ level=info msg="Stopping all fetchers"
+ tag=stopping_fetchers id=ConsumerFetcherManager-1382721708341
+ module=kafka.consumer.ConsumerFetcherManager
+
+In development, a log output formatter can then give the `msg` field special
+treatment by displaying it in a way that a human can easily read (along with
+other special fields like `level`):
+
+ info | Stopping all fetchers module=kafka.consumer.ConsumerFetcherManager
+ info | Performing log compaction module=kafka.compacter.LogCompactionManager
+ info | Performing garbage collection module=kafka.cleaner.GarbageCollectionManager
+ info | Starting all fetchers module=kafka.consumer.ConsumerFetcherManager
+
+I'd also recommend introducing a convention to assign a machine-friendly "tag"
+field so that any of these lines can still easily be found with a Splunk
+search:
+
+ info | Stopping all fetchers tag=stopping_fetchers module=kafka.consumer.ConsumerFetcherManager
+ info | Performing log compaction tag=log_compaction module=kafka.compacter.LogCompactionManager
+ info | Performing garbage collection tag=garbage_collection module=kafka.cleaner.GarbageCollectionManager
+ info | Starting all fetchers tag=starting_fetchers module=kafka.consumer.ConsumerFetcherManager
+
+## Building context (#building-context)
+
+logfmt also lends itself well to building context around operations. Inside a request for example, as important information becomes available, it can be added to a request-specific context and included with every log line published by the app. This may not seem immediately useful, but it can be very helpful while debugging in production later, as only a single log line need be found to get a good idea of what's going on.
+
+For instance, consider this simple Sinatra app:
+
+``` ruby
+def authenticate!
+ @user = User.authenticate!(env["HTTP_AUTHORIZATION"]) || throw(401)
+ log_context.merge! user: @user.email, user_id: @user.id
+end
+
+def find_app
+ @app = App.find!(params[:id])
+ log_context.merge! app: @app.name, app_id: @app.id
+end
+
+before do
+ log "Starting request", tag: "request_start"
+end
+
+get "/:id" do
+ authenticate!
+ find_app!
+end
+
+after do
+ log "Finished request", tag: "request_finish", status: response.status
+end
+
+error do
+ e = env["sinatra.error"]
+ log "Request errored", tag: "request_error",
+ error_class: e.class.name, error_message: e.message
+end
+```
+
+Typical logging produced as part of a request might look like this:
+
+ msg="Request finished" tag=request_finish status=200
+ user=brandur@mutelight.org user_id=1234 app=mutelight app_id=1234
+
+The value becomes even more apparent when we consider what would be logged on an error, which automatically contains some key information to help with debugging (note that in real life, we'd include a stack trace as well):
+
+ msg="Request errored" tag=request_error error_class=NoMethodError
+ error_message="undefined method `serialize' for nil:NilClass"
+ user=brandur@mutelight.org user_id=1234 app=mutelight app_id=1234
+
+## Implementations (#implementations)
+
+A few projects already exist to help parse logfmt in various languages:
+
+* [logfmt for Clojure](https://github.com/tcrayford/logfmt)
+* [logfmt for Go](http://godoc.org/github.com/kr/logfmt)
+* [logfmt for Node.JS](https://github.com/csquared/node-logfmt)
+* [logfmt for Python](https://pypi.python.org/pypi/logfmt/0.1)
+* [logfmt for Ruby](https://github.com/cyberdelia/logfmt-ruby)
+
+[logrus]: https://github.com/Sirupsen/logrus
diff --git a/content/articles/mediator.md b/content/articles/mediator.md
new file mode 100644
index 000000000..bba5e8fcc
--- /dev/null
+++ b/content/articles/mediator.md
@@ -0,0 +1,258 @@
+---
+hook: Interactors by a different name.
+location: San Francisco
+published_at: 2014-03-11T15:25:07Z
+title: The Mediator Pattern
+---
+
+Grouper published a post last week [about how they use interactors](http://eng.joingrouper.com/blog/2014/03/03/rails-the-missing-parts-interactors) in their Rails app to help keep their ActiveRecord models as lean as possible. Somewhat amusingly, while doing a major refactor of the Heroku API, we'd independently arrived at a nearly identical pattern after learning the hard way that callbacks and large models are the inviting pool whose frothy water conceals treacherous rocks.
+
+The main difference was in appellation: we called the resulting POROs "mediators", a [design pattern](http://en.wikipedia.org/wiki/Mediator_pattern) that defines how a set of objects interact. I'm not one to quarrel over naming, but I'll use the term "mediator" throughout this article because that's how I'm used to thinking about this pattern.
+
+The intent of this article is to build on what Grouper wrote by talking about some other nice patterns that we've built around the use of mediators/interactors.
+
+## Lean endpoints (#lean-endpoints)
+
+One goal of our usage of mediators is to consolidate all the business logic that might otherwise have to reside in a combination of an API endpoint's body and methods on models. Ideally what remains in the endpoint should be a set of request checks like authentication, ACL, and parameters; a single call down to a mediator; and response logic like serialization and status.
+
+Here's a small (and slightly beautified) excerpt from the API endpoint for creating an SSL Endpoint:
+
+``` ruby
+module API::Endpoints::APIV3
+ class SSLEndpoints < Base
+ ...
+
+ namespace "/apps/:id/ssl-endpoints" do
+ before do
+ authorized!
+ @ap = get_any_app!
+ check_permissions!(:manage_domains, @ap)
+ check_params!
+ end
+
+ post do
+ @endpoint = API::Mediators::SSLEndpoints::Creator.run(
+ auditor: self,
+ app: @ap,
+ key: v3_body_params[:private_key],
+ pem: v3_body_params[:certificate_chain],
+ user: current_user
+ )
+ respond serialize(@endpoint), status: 201
+ end
+ end
+
+ ...
+ end
+end
+```
+
+This pattern produces a convention that helps keep important logic out of endpoints and in the more readily accessible mediator classes. It also keeps unit tests for the endpoints focused on what those endpoints are responsible for: authentication, parameter and permission checks, serialization, and the like. For success cases, we can mock out the mediator's call and response and focus on doing more comprehensive tests on the business logic in the mediator's own unit tests. The entire stack still gets exercised at the integration test level, but we don't have to get into the same level of exhaustive testing there.
+
+A mocked endpoint unit test might look like the following (note that the specs are using the [rr](https://github.com/rr/rr) mocking syntax):
+
+``` ruby
+# endpoint unit tests
+describe API::Endpoints::APIV3::SSLEndpoints do
+ ...
+
+ describe "POST /apps/:id/ssl-endpoints" do
+ it "calls into the mediator" do
+ mock(API::Endpoints::APIV3::SSLEndpoints).run(hash_including({
+ app: @app,
+ key: "my-private-key",
+ pem: "my-pem",
+ user: @user,
+ })
+ authorize "", @user.api_key
+ header "Content-Type", "application/json"
+ post "/apps/#{@app.name}/ssl-endpoints", MultiJson.encode({
+ private_key: "my-private-key",
+ certificate_chain: "my-pem",
+ })
+ end
+ end
+
+ ...
+end
+```
+
+The mediator unit tests will go into far greater detail and look something like this:
+
+``` ruby
+# mediator unit tests
+describe API::Mediators::SSLEndpoints::Creator do
+ ...
+
+ it "produces an SSL Endpoint" do
+ endpoint = run
+ assert_kind_of API::Models::SSLEndpoint, endpoint
+ end
+
+ it "makes a call to the Ion API to create the endpoint" do
+ mock(IonAPI).create_endpoint
+ run
+ end
+
+ ...
+
+ private
+
+ def run(options = {})
+ API::Mediators::SSLEndpoints::Creator.run({
+ app: @app,
+ key: @key_contents,
+ pem: @pem_contents,
+ user: @app.owner,
+ }.merge(options))
+ end
+end
+```
+
+## Lean jobs (#lean-jobs)
+
+Much in the same way that mediators keep our endpoints lean, they do the same for our async jobs. By encapsulating all business logic into a mediator, we leave jobs to focus on only two things:
+
+1. **Model materialization:** Async jobs are passed through some kind of backchannel like a database table or a redis queue, and have to be marshaled on the other side. It's up to the job to figure out how to find and instantiate the models that it needs to inject into its mediator. This logic may change from job to job: if we have a job to create a logging channel for an app, but that app has already been deleted by the time it runs, then we should fall through the job without an error; but if we have an async job to destroy an app, and its record is no longer available, then something unexpected happened and we should raise an error.
+2. **Error handling:** A job's second responsibility is to rescue errors and figure out what to do with them. If we're trying to provision an SSL Endpoint and got a connection error to our downstream endpoints service, then we might want to send the job back into the work queue; but if something like a configuration error occurred, we might want to notify our error service and fail the job permanently.
+
+Let's look at what an async job might look like for the hypothetical SSL Endpoint creation mediator from above:
+
+``` ruby
+module API::Jobs::SSLEndpoints
+ class Creator < API::Jobs::Base
+ def initialize(args = {})
+ super
+ require_args!(
+ :app_id,
+ :key,
+ :pem,
+ :user_id
+ )
+ end
+
+ def call
+ # If the app is no longer present, then it's been deleted since the job
+ # was dequeued; succeed without doing anything.
+ return unless @app = App.find_by_id(args[:app_id])
+
+ # If the user is no longer present, then they may have deleted their
+      # account since the job was dequeued; succeed without doing anything.
+ return unless @user = User.find_by_id(args[:user_id])
+
+ API::Mediators::SSLEndpoints::Creator.run(
+ auditor: self,
+ app: @app,
+ key: args[:key],
+ pem: args[:pem],
+ user: @user
+ )
+
+ # Something is wrong which will prevent the job from ever succeeding. Fail
+      # the job permanently and notify operators of the error.
+ rescue API::Error::ConfigurationMissing => e
+ raise API::Error::JobFailed.new(e)
+
+ # Something has caused a temporary disruption in service. Queue the job
+ # again for retry.
+ rescue Excon::Errors::Error
+ raise API::Error::JobRetry
+ end
+ end
+end
+```
+
+(Note that the above is a simplified example. If you were going to send a sensitive secret like an SSL key through an insecure channel, we'd want to encrypt it.)
+
+## Strong preconditions (#strong-preconditions)
+
+From within any mediator, we assume that a few preconditions have already been met:
+
+* **Parameters:** All parameters are present in their expected form.
+* **Models:** Rather than passing around abstract identifiers, parameters are materialized models so that no look-up logic needs to be included.
+* **Security:** Security checks like authentication and access control have already been made.
+
+Making these strong assumptions has a number of advantages:
+
+* The complexity of the resulting code is reduced dramatically. We don't have to spend LOCs checking that objects are present or whether they're in their expected form (almost like working in a strongly typed language!).
+* It eases testing as the boilerplate for checking parameter validation and the like can be consolidated elsewhere.
+* Allows mediators to be called more easily from outside their normal context like from a debugging/operations console session.
+
+## Mediators all the way down (#nesting-mediators)
+
+One way to think about mediators is that they encapsulate a discrete piece of work that involves interaction between a set of objects; a piece of work that otherwise might have ended up in an unwieldy method on a model. Because units of work are often composable, just like those model methods would have been, it's a common pattern for mediators to make calls to other mediators.
+
+Here's a small example of an app mediator that also deprovisions the app's installed add-ons:
+
+``` ruby
+module API::Mediators::Apps
+ class Destroy < API::Mediators::Base
+ ...
+
+ def destroy_addons
+ @app.addons.each do |addon|
+ API::Mediators::Addons::Destroyer.run(
+ addon: addon,
+ auditor: @auditor,
+ )
+ end
+ end
+
+ ...
+ end
+```
+
+Of course it's important that your mediators have a clear call hierarchy so as not to develop any circular dependencies, but as long as developers don't get too overzealous with mediator creation, this is pretty safe.
+
+## Patterns through convention (#convention)
+
+While establishing mediators as the default unit of work, it's also a convenient time to start building other useful conventions into them. For example, we build in an auditing pattern so that we're still able to produce a trail of audit events even if the mediator's work is performed from unexpected places like a console:
+
+``` ruby
+module API::Mediators::Apps
+ class Destroy < API::Mediators::Base
+ ...
+
+ def call
+ audit do
+ ...
+ end
+ end
+
+ private
+
+ def audit(&block)
+ @auditor.audit("destroy-app", target_app: @app, &block)
+ end
+ end
+end
+```
+
+Another example of an established convention is to try and build out call bodies composed of a series of one-line calls to helpers that produces a very readable set of operations that any given mediator will perform:
+
+``` ruby
+module API::Mediators::Apps
+ class Destroy < API::Mediators::Base
+ ...
+
+ def call
+ audit do
+ App.transaction do
+ destroy_addons
+ destroy_domains
+ destroy_ssl_endpoints
+ close_payment_method_history
+ close_resource_histories
+ delete_logplex_channel
+ @app.destroy
+ end
+ end
+ end
+
+ ...
+ end
+```
+
+A few years into working with the mediator pattern now, and I'd never go back. Although mediator calls are a little more verbose than they might have been as model methods, they've allowed us to lean out the majority of our models to contain only basics like associations, validations, and accessors. This has the added advantage of leaving us more decoupled from our ORM (ActiveRecord in this case) than ever before.
+
+Eliminating callbacks has also been a hugely important step forward in that it reduces production incidents caused by running innocent-looking code that results in major side effects, and leaves us with more transparent test code.
diff --git a/content/articles/microservices.md b/content/articles/microservices.md
new file mode 100644
index 000000000..b504b3c7c
--- /dev/null
+++ b/content/articles/microservices.md
@@ -0,0 +1,69 @@
+---
+hook: Useful distinction or new buzzword? Comments on 200-500 line services.
+location: San Francisco
+published_at: 2014-04-01T12:37:17Z
+title: Microservices
+---
+
+A recent article by James Lewis and Martin Fowler has kicked off a discussion about "microservices" and SOA in general; one of the few topics in the API community besides Hypermedia that's pretty much guaranteed to generate some healthy debate. The [original body of work](http://martinfowler.com/articles/microservices.html) defines the characteristics of a microservice:
+
+* Services are out-of-process components that can be deployed and operated independently.
+* Organizes teams around _business capability_ so that each service is run by a cross-functional group.
+* Services communicate via lightweight HTTP interfaces. A heavy [ESB](http://en.wikipedia.org/wiki/Enterprise_service_bus) is no longer needed to choreograph data.
+* Services are language agnostic and can be built according to the technologies that a team selects.
+* Services don't share databases.
+
+At least some portion of the community expresses some incredulity that the talking points are enough to justify a distinct concept from the more classic idea of SOA that was first put forward [somewhere in the neighborhood of 2005](http://books.google.com/books/about/Service_Oriented_Architecture.html?id=qLrLngEACAAJ). Many companies have architectures that would check all the boxes above without ever having heard the word "microservice".
+
+Even among its originators, the term doesn't seem to have a perfect definition with wide consensus. A talk by [Fred George on the subject in Barcelona](https://www.youtube.com/watch?v=2rKEveL55TY) adds the "micro-" back in microservices and puts forward some more radical assertions regarding their nature:
+
+* Services should be 200-500 LOC.
+* Self-monitoring services replace unit tests, and business monitoring replaces acceptance tests.
+* The system is long-lived while its services are short-lived. Services are disposed as refinements come along to re-work the architecture.
+
+## Autonomy of small services (#autonomy)
+
+Although microservices might be SOA with a modern veneer of HTTP, JSON, and polyglot, the concept of a "micro microservice" (that 200-500 LOC sweet spot) is worth considering in a bit more depth. In my own experience, not all services can fit into this size, but services that do are remarkably more stable than their counterparts --- and for anyone who's an operator as well as a developer, building a completely autonomous system is an idyllic vision well worthy of pursuit.
+
+These tiny services have some major advantages over their heavier counterparts:
+
+* They have an inherently smaller surface area, and their development can be continually iterated until all bugs are squashed.
+* Due to their small area of responsibility, they're rarely in the same state of constant change that's a matter of course in many larger codebases. Less change is natural protection against new bugs or regressions.
+* In many cases their resource use will be smaller, which could help avoid a class of bugs stemming from overutilization like GC pauses, out-of-memory errors, swapping, etc.
+* May be able to use only a very reliable data store like S3 or even be made completely stateless, which can avoid a single point of failure like a relational database.
+
+I ran some inventory of our own production services, and came up with a few that do in fact make the 500 LOC microservice cut:
+
+* **Addons SSO:** 171 LOC. This tiny service authenticates a user via OAuth, then asks the API to sign a request on their behalf before redirecting. Powers `heroku addons:open`.
+* **Anvil:** 337 LOC. [A platform-powered build system](https://github.com/ddollar/heroku-anvil) that compiled slugs and released them directly rather than using the more traditional `git push heroku master` route.
+* **Cloner:** 305 LOC. A tiny app that authenticates via OAuth and makes an API call. Powers [java.heroku.com](https://java.heroku.com).
+* **Zendesk SSO:** 348 LOC. Creates Zendesk accounts for new Heroku users so that they can open support tickets.
+
+A few others didn't quite make weight, but are still small:
+
+* **Deployhooks:** 1240 LOC. A small service that powers the [Heroku Deployhooks add-on](https://devcenter.heroku.com/articles/deploy-hooks).
+* **Scheduler:** 630 LOC. Powers the web frontend for the [Heroku Scheduler add-on](https://devcenter.heroku.com/articles/scheduler).
+* **Vixie:** 805 LOC. Powers the backend of Heroku's Scheduler add-on, and receives instructions from the scheduler above.
+
+_(Jamie Hodge points out that many of these services lack full databases and therefore might better be described as coordinators. The more limited state which they maintain simplifies their operational characteristics to a large degree.)_
+
+One common trait of all the services listed above is that their autonomy is remarkable. We have some basic alarms on them in case they go down, but they go off rarely. Being deployed on the Heroku platform is certainly a big help here, but also that their concerns are so narrow and unchanging that there isn't a lot of room for bugs to hide.
+
+I suspect that 500 LOC isn't enough to box in all concerns of many components, even if they are quite narrow in scope -- anecdotally, most of our more important services easily blow past this limit. I'm also not quite at the point where I'd replace my unit or acceptance tests with self- or business monitoring (also one of the assertions made above).
+
+## SOA isn't a silver bullet (#downsides)
+
+SOA bestows a huge number of architectural advantages, but we probably want to be careful to keep its expectations in check. Boiled down to a fundamental level, SOA is about introducing very hard isolation between components that can result in big gains in organizational and operational effectiveness, but by extension leads to component boundaries that are more difficult to evolve.
+
+The less desirable features of a SOA-like system include the following:
+
+* Any change to the contract between two services will require coordinated development and deployment on both sides. This can be especially slow if those services are managed by different groups of people.
+* Data becomes much more widely distributed and more difficult to inspect and query. This can be solved with something like a data warehouse, but that's another component to build and manage.
+* There is some overhead to building the platform that will enable the deployment of new services. Tools like Docker (or Heroku) will help with this, but new services still need metrics dashboards, alarms, deployment and operation tools, etc.
+* More integration testing has to be moved out of individual components and up into a working system to be effective; this is inevitably slower and more opaque.
+
+Although not meant to sound too bearish on microservice type architectures, the short version is that it's not just about building the system --- it's also about building the tools, processes, and infrastructure to operate it. Larger companies will almost inevitably have to move to something that looks like SOA to keep forward progress possible, but it might not make sense for smaller shops to rush into it headlong, even if the technology is really cool.
+
+[Wikipedia](http://en.wikipedia.org/wiki/Service-oriented_architecture) nails this idea in a single line:
+
+> Significant vendor hype surrounds SOA, which can create exaggerated expectations.
diff --git a/content/articles/minimalism.md b/content/articles/minimalism.md
new file mode 100644
index 000000000..a0dd573ba
--- /dev/null
+++ b/content/articles/minimalism.md
@@ -0,0 +1,231 @@
+---
+title: In Pursuit of Production Minimalism
+published_at: 2017-05-10T13:35:02Z
+location: San Francisco
+hook: Practicing minimalism with the lofty goal of total
+ ephemeralization to build coherent, stable, and operable
+ stacks.
+attributions: Photographs by Ben Harrington (SR-71), Robyn Jay (embers of a burning fire), and Md. Al Amin (boat and sky). Licensed under Creative Commons BY-NC-ND 2.0, BY-SA 2.0, and CC BY 2.0 respectively.
+---
+
+While working at Lockheed during the cold war, Kelly
+Johnson was reported to have coined [KISS][kiss] ("keep it
+simple, stupid"); a principle that suggests glibly that
+systems should be designed to be as simple as possible.
+
+While complexity is never a conscious design goal of any
+project, it arises inherently as new features are pursued
+or new components are introduced. KISS encourages designers
+to actively counteract this force by making simplicity an
+objective in itself, and thus produce products that are
+more maintainable, more reliable, and more flexible. In the
+case of jet fighters, that might mean a plane that can be
+repaired in the field with few tools and under the
+stressful conditions of combat.
+
+During his tenure, Lockheed's Skunk Works would produce
+planes like the U-2 and SR-71; so notable for their
+engineering excellence that they've left a legacy that we
+reflect on even today.
+
+!fig src="/assets/minimalism/sr71.jpg" caption="The famous SR-71, one of the flagships of Lockheed's Skunk Works. Very fast even if not particularly simple."
+
+## Minimalism in technology (#in-technology)
+
+Many of us pursue work in the engineering field because
+we're intellectually curious. Technology is cool, and new
+technology is even better. We want to be using what
+everyone's talking about.
+
+Our news sources, meetups, conferences, and even
+conversations bias towards shiny new tech that's either
+under active development or being energetically promoted.
+Older components that sit quietly and do their job well
+disappear into the background.
+
+Over time, technologies are added, but are rarely removed.
+Left unchecked, production stacks that have been around
+long enough become sprawling patchworks combining
+everything under the sun. This effect is dangerous:
+
+* More parts means more cognitive complexity. If a system
+ becomes too difficult to understand then the risk of bugs
+ or operational mishaps increases as developers make
+ changes without understanding all the intertwined
+ concerns.
+
+* Nothing operates flawlessly once it hits production.
+ Every component in the stack is a candidate for failure,
+ and with sufficient scale, _something_ will be failing all
+ the time.
+
+* With more technologies engineers will tend to become
+ jacks of all trades, but masters of none. If a
+ particularly nefarious problem comes along, it may be
+ harder to diagnose and repair because there are few
+ specialists around who are able to dig deeply.
+
+Even knowing this, the instinct to expand our tools is hard
+to suppress. Oftentimes persuasion is a core competency of
+our jobs, and we can use that same power to convince
+ourselves and our peers that it's critical to get new
+technologies into our stack _right now_. That Go-based HA
+key/value store will take our uptime and fault resilience
+to new highs. That real-time event stream will enable
+an immutable ledger that will become a foundational keystone for
+the entire platform. That sexy new container orchestration
+system that will take ease of deployment and scaling to new
+levels. In many cases, a step back and a moment of
+dispassionate contemplation will reveal that their use
+could be deferred until a time when they're known to be
+well vetted, and it's well understood how they'll fit into
+the current architecture (and what they'll replace).
+
+## Minimalism through ephemeralization (#ephemeralization)
+
+In his book _Nine Chains to the Moon_ (published 1938),
+inventor R. Buckminster Fuller described the idea of
+***ephemeralization***:
+
+> Do more and more with less and less until eventually you
+> can do everything with nothing.
+
+It suggests increasing productive output by
+continually improving the efficiency of a system even while
+keeping input the same. I project this onto technology to
+mean building a stack that scales to more users and more
+activity while the people and infrastructure supporting it
+stay fixed. This is accomplished by building systems that
+are more robust, more automatic, and less prone to problems
+because the tendency to grow in complexity that's inherent
+to them has been understood, harnessed, and reversed.
+
+For a long time we had a very big and very aspirational
+goal of ephemeralization at Heroku. The normal app platform
+that we all know was referred to as "user space" while the
+internal infrastructure that supported it was called
+"kernel space". We wanted to break up the kernel and move
+it piece by piece to run inside the user
+space that it supported, in effect rebuilding Heroku so
+that it itself ran _on Heroku_. In the ultimate
+manifestation of ephemeralization, the kernel would
+diminish in size until it vanished completely. The
+specialized components that it contained would be retired,
+and we'd be left with a single perfectly uniform stack.
+
+Realistic? Probably not. Useful? Yes. Even falling short of
+an incredibly ambitious goal tends to leave you somewhere
+good.
+
+## Minimalism in practice (#in-practice)
+
+Here are a few examples of minimalism and ephemeralization
+in practice from Heroku's history:
+
+* The core database that tracked all apps, users, releases,
+ configuration, etc. used to be its own special snowflake
+ hosted on a custom-built AWS instance. It was eventually
+ folded into Heroku Postgres, and became just one more
+ node to be managed along with every other customer DB.
+
+* Entire products were retired where possible. For example,
+  the `ssl:ip` add-on (providing SSL/TLS termination for an
+ app), which used to be provisioned and run on its own
+ dedicated servers, was end-of-lifed completely when a
+ better (and cheaper) option for terminating SSL was
+ available through Amazon. With SNI support now
+ widespread, `ssl:endpoint` will eventually follow suit.
+
+* All non-ephemeral data was moved out of Redis so that the
+ only data store handling persistent data for internal
+ apps was Postgres. This had the added advantage of stacks
+ being able to tolerate a downed Redis and stay online.
+
+* After a misguided foray into production polyglotism, the
+ last component written in Scala was retired. Fewer
+ programming languages in use meant that the entire system
+ became easier to operate, and by more engineers.
+
+* The component that handled Heroku orgs was originally run
+ as its own microservice. It eventually became obvious
+ that there had been a time when our microservice
+ expansion had been a little overzealous, so to simplify
+ operation, we folded a few services back into the hub.
+
+To recognize the effort that went into tearing down or
+replacing old technology, we created a ritual where we
+symbolically fed dead components to a flame called a [burn
+party](/fragments/burn-parties). The time and energy spent
+on some of these projects would in some cases be as great
+as, or even greater than, it would for shipping a new product.
+
+!fig src="/assets/minimalism/fire.jpg" caption="At Heroku, we'd hold regular \"burn parties\" to recognize the effort that went into deprecating old products and technology."
+
+## Minimalism in ideas (#in-ideas)
+
+Practicing minimalism in production is mostly about
+recognizing that the problem exists. After achieving that,
+mitigations are straightforward:
+
+* ***Retire old technology.*** Is something new being
+ introduced? Look for opportunities to retire older
+ technology that's roughly equivalent. If you're about to
+ put Kafka in, maybe you can get away with retiring Rabbit
+ or NSQ.
+
+* ***Build common service conventions.*** Standardize on
+ one database, one language/runtime, one job queue, one
+ web server, one reverse proxy, etc. If not one, then
+ standardize on _as few as possible_.
+
+* ***Favor simplicity and reduce moving parts.*** Try to
+ keep the total number of things in a system small so that
+ it stays easy to understand and easy to operate. In some
+ cases this will be a compromise because a technology
+  that's slightly less suited to a job may have to be re-used
+ even if there's a new one that would technically be a
+ better fit.
+
+* ***Don't use new technology the day, or even the year,
+ that it's initially released.*** Save yourself time and
+ energy by letting others vet it, find bugs, and do the
+ work to stabilize it. Avoid it permanently if it doesn't
+ pick up a significant community that will help support it
+ well into the future.
+
+* ***Avoid custom technology.*** Software that you write is
+ software that you have to maintain. Forever. Don't
+ succumb to NIH when there's a well supported public
+ solution that fits just as well (or even almost as well).
+
+* ***Use services.*** Software that you install is software
+ that you have to operate. From the moment it's activated,
+ someone will be taking regular time out of their schedule
+ to perform maintenance, troubleshoot problems, and
+ install upgrades. Don't succumb to NHH (not hosted here)
+ when there's a public service available that will do the
+ job better.
+
+It's not that new technology should _never_ be introduced,
+but it should be done with rational defensiveness, and with
+a critical eye in how it'll fit into an evolving (and
+hopefully ever-improving) architecture.
+
+## Nothing left to add, nothing left to take away (#nothing-left-to-add-or-take-away)
+
+Antoine de Saint Exupéry, a French poet and pioneering
+aviator, had this to say on the subject:
+
+> It seems that perfection is reached not when there is
+> nothing left to add, but when there is nothing left to
+> take away.
+
+!fig src="/assets/minimalism/sea.jpg" caption="Nothing left to add. Nothing left to take away."
+
+Most of us can benefit from architecture that's a little
+simpler, a little more conservative, and a little more
+directed. Only by concertedly building a minimal stack
+that's stable and nearly perfectly operable can we maximize
+our ability to push forward with new products and ideas.
+
+[kiss]: https://en.wikipedia.org/wiki/KISS_principle
diff --git a/content/articles/newsletters.md b/content/articles/newsletters.md
new file mode 100644
index 000000000..61d26054f
--- /dev/null
+++ b/content/articles/newsletters.md
@@ -0,0 +1,195 @@
+---
+title: "Pseudo-HTML and Pidgin CSS: Building an Email Newsletter"
+published_at: 2017-08-02T14:52:31Z
+hook: Building a toolchain for sending a newsletter, and
+ the dismal state of HTML and CSS in email.
+---
+
+After a recent trip to Portland, I decided to try writing a
+newsletter. I don't post on Facebook or other social media very
+often, and don't otherwise do a good job of staying in
+touch with friends and family, so I'm always on the lookout
+for ways to develop better habits to help address that.
+I've been a subscriber to a few high quality newsletters
+from other people for years and reading them is something I
+really enjoy. Publishing my own seemed like a worthwhile
+experiment to the end of doing a better job of staying
+connected.
+
+As one of the holdouts who refuses to move to a centralized
+publishing service (or publish content exclusively to a
+social network), I also like the idea of supporting the
+independent web. Bloggers used to have a powerful
+distribution channel in the form of RSS, and although the
+technology still exists today, it's been fading for
+years, with more people moving exclusively towards their
+favorite social platform for content discovery. Email is a
+flawed technology in many ways, but it's one of the few
+communication channels that every connected person in the
+world will reliably have, and it fully supports sending
+rich media (images, links, videos, quotes, etc.) over open
+standards, and even content more than 140 characters long.
+
+There's nothing revelatory here, but I'll go through a few
+of the tools and services that I used, and some of the
+surprises along the way.
+
+## The right shape of service (#service)
+
+Initially I assumed that the best way to go would be
+through one of the many newsletter services like MailChimp
+or TinyLetter. Maybe it is, but in all the cases I looked
+at they either wanted to paste a branded footer on the end
+of everything you send, have you use a horrible WYSIWYG
+editor, or both; I also wanted to archive old copies on the
+web somewhere, and it was looking like I'd have to reinvent
+a custom layer on top of whatever service I ended up using.
+
+I sure wasn't about to start sending email myself, so I
+still wanted to use a service, but kept looking for one
+that exposed the right primitives. I wanted pixel perfect
+control over the visuals, but to fully offload subscription
+management to someone else.
+
+After a little more Googling I discovered that Mailgun
+offered an API for mailing lists. I've been using Mailgun
+for sending mail for years at Heroku, then at Stripe, and
+their service has been well-designed and reliable. Even
+better, at the email volumes that I'll be working with I'm
+unlikely to ever leave the free tier, and even beyond that
+their prices are good. I poked around their control panel
+for a while and experimented by sending a few messages to
+myself with their Go SDK, and it was off to the races.
+
+``` go
+// Sample code working with the Mailgun SDK -- build a
+// message, add HTML and plaintext content, and send away.
+
+mg := mailgun.NewMailgun(mailDomain, conf.MailgunAPIKey, "")
+
+message := mailgun.NewMessage(
+ fromAddress,
+ fmt.Sprintf("Passages & Glass %s — %s",
+        passage.Issue, passage.Title),
+ passage.ContentRaw,
+ recipient)
+message.SetReplyTo(replyToAddress)
+message.SetHtml(html)
+
+resp, _, err := mg.Send(message)
+if err != nil {
+ log.Fatal(err)
+}
+log.Printf(`Sent to: %s (response: "%s")`, recipient, resp)
+```
+
+## Pidgin CSS (#css)
+
+Over the last few decades, we've had pretty good success in
+standardizing how HTML and CSS are rendered across
+browsers, and during the time some huge victories were won.
+Progressively more sophisticated tests like Acid2 and Acid3
+dragged browsers up to spec and established widespread
+consistency in how they render the same code. Even
+stragglers like IE that most had deemed hopeless were made
+to fall in line. I'd naively assumed that this was a war
+that had long since been won, only to realize that on the
+email front, the battle's still going, and has been
+quagmired for years.
+
+Email clients are a million miles away from rendering
+anything that's even remotely compliant with anything, and
+they're all uncompliant in their own exotic ways. Some
+clients are better than others, and somewhat ironically the
+companies that we tend to think of as the most advanced in
+the world are some of the most regressive. If you threw
+[Acid2][acid2] at Google Mail, you'd be lucky to see a lone
+yellow pixel on screen.
+
+The newsletter industry has dealt with this less than
+optimal state of affairs by developing a form of "pidgin
+CSS" made up of the lowest common denominator of what the
+world's diverse set of clients will handle. [Campaign
+Monitor's CSS support matrix][email-css] does a good job of
+showing just how divergent (and underwhelming) feature
+support is between clients. Best practice is to keep HTML
+email as basic as possible. Fancy CSS keywords like `float`
+are best avoided, anything developed this decade like
+`flex` and `grid` are totally out, and `<table>` is still
+the state of the art when it comes to building more complex
+layouts.
+
+I found that everything beyond the most trivially basic CSS
+usually caused problems in at least one mail client (often
+Google Mail):
+
+* `