Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions build/db_schemas/scd.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@
"upto-v3.0.0-add_inverted_indices.sql": importstr "scd/upto-v3.0.0-add_inverted_indices.sql",
"upto-v3.1.0-create_uss_availability.sql": importstr "scd/upto-v3.1.0-create_uss_availability.sql",
"upto-v3.2.0-add_ovn_columns.sql": importstr "scd/upto-v3.2.0-add_ovn_columns.sql",
"upto-v3.3.0-add_locks.sql": importstr "scd/upto-v3.3.0-add_locks.sql",
},
}
5 changes: 5 additions & 0 deletions build/db_schemas/scd/downfrom-v3.3.0-remove_locks.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
DROP TABLE IF EXISTS scd_locks;

UPDATE schema_versions
SET schema_version = 'v3.2.0'
WHERE onerow_enforcer = TRUE;
9 changes: 9 additions & 0 deletions build/db_schemas/scd/upto-v3.3.0-add_locks.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
CREATE TABLE IF NOT EXISTS scd_locks (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to add a column specifying the instance that created the lock? That might facilitate debugging if required.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really, as we just do a select for update. We could add a column and update it, but that would mean more 'work' in the transaction (we now have to commit something).

And as it would be in the transaction, we would only see the value when it's committed (so too late).

key INT64 PRIMARY KEY
);

INSERT INTO scd_locks (key) VALUES (0);

UPDATE schema_versions
SET schema_version = 'v3.3.0'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering whether this should be a major version bump: an older schema with a newer image will appear to work, however the locking won't be effective on the DSS instances with the older images.

Do I understand correctly that this will have an impact only on performances? Or will it have an impact on the correctness of the results? If the former: probably not needed to major version bump. But not if the latter.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It won't work if the option is used, but only in that case, so I'm not sure about the answer.

I would say that as it's only an experimental one, that safe to keep it as minor, as it shouldn't be required for normal usage.

WHERE onerow_enforcer = TRUE;
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
DROP TABLE IF EXISTS scd_locks;

UPDATE schema_versions set schema_version = 'v1.0.0' WHERE onerow_enforcer = TRUE;
7 changes: 7 additions & 0 deletions build/db_schemas/yugabyte/scd/upto-v1.1.0-add_locks.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
CREATE TABLE IF NOT EXISTS scd_locks (
key BIGINT PRIMARY KEY
);

INSERT INTO scd_locks (key) VALUES (0);

UPDATE schema_versions set schema_version = 'v1.1.0' WHERE onerow_enforcer = TRUE;
12 changes: 12 additions & 0 deletions build/dev/read_scd_lock_mode.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

set -eo pipefail

# Retrieve token from dummy OAuth server
ACCESS_TOKEN=$(curl --silent \
"http://localhost:8085/token?grant_type=client_credentials&scope=interuss.pool_status.read&intended_audience=localhost&issuer=localhost&sub=check_scd" \
| python extract_json_field.py 'access_token')

curl --silent -X GET \
"http://localhost:8082/aux/v1/configuration/scd_lock_mode" \
-H "Authorization: Bearer ${ACCESS_TOKEN}" -H "Content-Type: application/json"
1 change: 0 additions & 1 deletion build/dev/read_version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,3 @@ ACCESS_TOKEN=$(curl --silent \
curl --silent -X GET \
"http://localhost:8082/versions/local.test.identity" \
-H "Authorization: Bearer ${ACCESS_TOKEN}" -H "Content-Type: application/json"

11 changes: 7 additions & 4 deletions cmds/core-service/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ var (
jwksKeyIDs = flag.String("jwks_key_ids", "", "IDs of a set of key in a JWKS, separated by commas")
keyRefreshTimeout = flag.Duration("key_refresh_timeout", 1*time.Minute, "Timeout for refreshing keys for JWT verification")
jwtAudiences = flag.String("accepted_jwt_audiences", "", "comma-separated acceptable JWT `aud` claims")

scdGlobalLock = flag.Bool("enable_scd_global_lock", false, "Experimental: Use a global lock when working with SCD subscriptions. Reduce global throughput but improve throughput with lot of subscriptions in the same areas.")
)

const (
Expand Down Expand Up @@ -106,7 +108,7 @@ func createKeyResolver() (auth.KeyResolver, error) {
}
}

func createAuxServer(ctx context.Context, locality string, publicEndpoint string, logger *zap.Logger) (*aux.Server, error) {
func createAuxServer(ctx context.Context, locality string, publicEndpoint string, scdGlobalLock bool, logger *zap.Logger) (*aux.Server, error) {
connectParameters := flags.ConnectParameters()
connectParameters.DBName = "aux"
datastore, err := datastore.Dial(ctx, connectParameters)
Expand Down Expand Up @@ -138,7 +140,7 @@ func createAuxServer(ctx context.Context, locality string, publicEndpoint string
return nil, stacktrace.Propagate(err, "Unable to store current metadata")
}

return &aux.Server{Store: auxStore, Locality: locality}, nil
return &aux.Server{Store: auxStore, Locality: locality, ScdGlobalLock: scdGlobalLock}, nil
}

func createRIDServers(ctx context.Context, locality string, logger *zap.Logger) (*rid_v1.Server, *rid_v2.Server, error) {
Expand Down Expand Up @@ -200,7 +202,7 @@ func createSCDServer(ctx context.Context, logger *zap.Logger) (*scd.Server, erro
return nil, stacktrace.Propagate(err, "Failed to connect to strategic conflict detection database; verify your database configuration is current with https://github.com/interuss/dss/tree/master/build#upgrading-database-schemas")
}

scdStore, err := scdc.NewStore(ctx, datastore)
scdStore, err := scdc.NewStore(ctx, datastore, *scdGlobalLock)
if err != nil {
// TODO: More robustly detect failure to create SCD server is due to a problem that may be temporary
if strings.Contains(err.Error(), "connect: connection refused") || strings.Contains(err.Error(), "database \"scd\" does not exist") {
Expand Down Expand Up @@ -233,6 +235,7 @@ func RunHTTPServer(ctx context.Context, ctxCanceler func(), address, locality st
logger.Info("version", zap.Any("version", version.Current()))
logger.Info("build", zap.Any("description", build.Describe()))
logger.Info("config", zap.Bool("scd", *enableSCD))
logger.Info("config", zap.Bool("scdGlobalLock", *scdGlobalLock))

if len(*jwtAudiences) == 0 {
// TODO: Make this flag required once all parties can set audiences
Expand All @@ -250,7 +253,7 @@ func RunHTTPServer(ctx context.Context, ctxCanceler func(), address, locality st
)

// Initialize aux
auxV1Server, err = createAuxServer(ctx, locality, *publicEndpoint, logger)
auxV1Server, err = createAuxServer(ctx, locality, *publicEndpoint, *scdGlobalLock, logger)
if err != nil {
return stacktrace.Propagate(err, "Failed to create aux server")
}
Expand Down
2 changes: 1 addition & 1 deletion cmds/db-manager/cleanup/evict.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ func getSCDStore(ctx context.Context) (*scdc.Store, error) {
return nil, fmt.Errorf("failed to connect to SCD database with %+v: %w", logParams, err)
}

scdStore, err := scdc.NewStore(ctx, datastore)
scdStore, err := scdc.NewStore(ctx, datastore, false)
if err != nil {
return nil, fmt.Errorf("failed to create strategic conflict detection store with %+v: %w", connectParameters, err)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
locals {
rid_db_schema = var.desired_rid_db_version == "latest" ? (var.datastore_type == "cockroachdb" ? "4.0.0" : "1.0.1") : var.desired_rid_db_version
scd_db_schema = var.desired_scd_db_version == "latest" ? (var.datastore_type == "cockroachdb" ? "3.2.0" : "1.0.1") : var.desired_scd_db_version
scd_db_schema = var.desired_scd_db_version == "latest" ? (var.datastore_type == "cockroachdb" ? "3.3.0" : "1.1.0") : var.desired_scd_db_version
aux_db_schema = var.desired_aux_db_version == "latest" ? "1.1.0" : var.desired_aux_db_version
}
4 changes: 2 additions & 2 deletions deploy/services/helm-charts/dss/templates/schema-manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
{{- $jobVersion := .Release.Revision -}} {{/* Jobs template definition is immutable, using the revision in the name forces the job to be recreated at each helm upgrade. */}}

{{- $waitForDatastore := include "init-container-wait-for-http" (dict "serviceName" "cockroachdb" "url" (printf "http://%s:8080/health" $datastoreHost)) -}}
{{- $schemas := dict "rid" "4.0.0" "scd" "3.2.0" "aux_" "1.1.0" }}
{{- $schemas := dict "rid" "4.0.0" "scd" "3.3.0" "aux_" "1.1.0" }}

{{- if .Values.yugabyte.enabled }}
{{- $waitForDatastore = include "init-container-wait-for-http" (dict "serviceName" "yb-tserver" "url" (printf "http://%s:9000/status" $datastoreHost)) -}}
{{- $schemas = dict "rid" "1.0.1" "scd" "1.0.1" "aux_" "1.1.0" }}
{{- $schemas = dict "rid" "1.0.1" "scd" "1.1.0" "aux_" "1.1.0" }}
{{- end -}}

{{- range $service, $schemaVersion := $schemas }}
Expand Down
2 changes: 1 addition & 1 deletion deploy/services/tanka/examples/minikube/main.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ local metadata = metadataBase {
enable: true,
image: 'docker.io/interuss-local/dss:latest',
desired_rid_db_version: '1.0.1',
desired_scd_db_version: '1.0.1',
desired_scd_db_version: '1.1.0',
desired_aux_db_version: '1.1.0',
},
evict+: {
Expand Down
2 changes: 1 addition & 1 deletion deploy/services/tanka/examples/minimum/main.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ local metadata = metadataBase {
enable: false, // <-- this boolean value is VAR_ENABLE_SCHEMA_MANAGER
image: 'VAR_DOCKER_IMAGE_NAME',
desired_rid_db_version: '4.0.0',
desired_scd_db_version: '3.2.0',
desired_scd_db_version: '3.3.0',
desired_aux_db_version: '1.1.0',
},
prometheus+: {
Expand Down
2 changes: 1 addition & 1 deletion deploy/services/tanka/examples/schema_manager/main.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ local metadata = metadataBase {
enable: true, // <-- this boolean value is VAR_ENABLE_SCHEMA_MANAGER
image: 'VAR_DOCKER_IMAGE_NAME',
desired_rid_db_version: '4.0.0',
desired_scd_db_version: '3.2.0',
desired_scd_db_version: '3.3.0',
desired_aux_db_version: '1.1.0',
},
};
Expand Down
2 changes: 1 addition & 1 deletion deploy/services/tanka/metadata_base.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
enable: false, // NB: Automatically enabled if should_init is set to true.
image: error 'must specify image',
desired_rid_db_version: '4.0.0',
desired_scd_db_version: '3.2.0',
desired_scd_db_version: '3.3.0',
desired_aux_db_version: '1.1.0',
},
evict: {
Expand Down
Binary file added docs/assets/perfs_scd_lock_notoverlapping.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/assets/perfs_scd_lock_overlapping.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/operations/.nav.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ nav:
- "Pooling (CockroachDB)": pooling-crdb.md
- "Monitoring": monitoring.md
- "Migrations": migrations.md
- "Performances": performances.md
- "Troubleshooting": troubleshooting.md
4 changes: 4 additions & 0 deletions docs/operations/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ See [Leaving a pool](pooling.md#leaving-a-pool)

See [Monitoring](monitoring.md)

## Performances

See [Performances](performances.md)

## Troubleshooting

See [Troubleshooting](troubleshooting.md)
30 changes: 30 additions & 0 deletions docs/operations/performances.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Performances

## The SCD global lock option

!!! danger
All DSS instances in a DSS pool must use the same value for this option. Mixing will result in dramatically lower performance.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO there isn't enough safeguards or mitigation that this does not happen. At least IMO we want some visibility, e.g. expose this configuration flag on the _aux interface?
cc @BenjaminPelletier

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From InterUSS weekly:

  • solution to this should not take a large effort
  • failing fast is OK
  • addressing this through doc only is OK
  • exposing through aux interface is OK

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the info in the aux interface


You can use the `/aux/v1/configuration/scd_lock_mode` endpoint to retrive the current value for a specifc DSS instance.

It has been reported in issue [#1311](https://github.com/interuss/dss/issues/1311) that creating a lot of overlapping operational intents may increase the datastore load in a way that creates timeouts.

By default, the code will try to lock on required subscriptions when working on operational intents, and having too many of them may lead to issues.

A solution to that is to switch to a global lock, that is just globally locking operational intents operations, regardless of subscriptions.

This will result in lower general throughput for operational intents that don't overlap, as only one of them can be processed at a time, but better performance in the issue's case as lock acquisition is simpler.

You should enable this option depending on your DSS usage/use case and what you want to maximize:
* If you have non-overlapping traffic and maximum global throughput, don't enable this flag
* If you have overlapping traffic and don't need high global throughput, enable this flag

The following graphs show example throughput without (on the left) and with the flag (on the right). This has been run on a local machine; on a real deployment you can expect lower performance (due to various latency), but similar relative numbers.

All graphs have been generated with the [loadtest present in the monitoring repository](https://github.com/interuss/monitoring/blob/main/monitoring/loadtest/README.md) using `SCD.py`.

![](../assets/perfs_scd_lock_overlapping.png)
*Overlapping requests. Notice the huge spikes on the left, as the datastore struggles to acquire locks.*

![](../assets/perfs_scd_lock_notoverlapping.png)
*Non-overlapping requests. Notice the reduction of performance on the right, with a single lock.*
29 changes: 29 additions & 0 deletions interfaces/aux_/aux_.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ components:
items:
type: string

SCDLockModeResponse:
type: object
properties:
global_lock:
description: The value of the 'enable_scd_global_lock' option for this DSS instance
type: boolean

paths:
/aux/v1/version:
get:
Expand Down Expand Up @@ -324,6 +331,28 @@ paths:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
/aux/v1/configuration/scd_lock_mode:
get:
summary: Return the value of the 'enable_scd_global_lock' lock option. May be used to ensure all participants in a pool are using the same value.
operationId: getScdLockMode
tags: [ dss ]
security:
- Auth:
- interuss.pool_status.read
responses:
'200':
description: The information is successfully returned.
content:
application/json:
schema:
$ref: '#/components/schemas/SCDLockModeResponse'
'501':
description: >-
The server has not implemented this operation.
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
security:
- Auth:
- dss.read.identification_service_areas
Expand Down
25 changes: 24 additions & 1 deletion pkg/api/auxv1/interface.gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ import (
)

var (
InterussPoolStatusHeartbeatWriteScope = api.RequiredScope("interuss.pool_status.heartbeat.write")
InterussPoolStatusReadScope = api.RequiredScope("interuss.pool_status.read")
DssWriteIdentificationServiceAreasScope = api.RequiredScope("dss.write.identification_service_areas")
InterussPoolStatusHeartbeatWriteScope = api.RequiredScope("interuss.pool_status.heartbeat.write")
DssReadIdentificationServiceAreasScope = api.RequiredScope("dss.read.identification_service_areas")
GetVersionSecurity = []api.AuthorizationOption{}
ValidateOauthSecurity = []api.AuthorizationOption{
Expand Down Expand Up @@ -37,6 +37,11 @@ var (
}
GetAcceptedCAsSecurity = []api.AuthorizationOption{}
GetInstanceCAsSecurity = []api.AuthorizationOption{}
GetScdLockModeSecurity = []api.AuthorizationOption{
{
"Auth": {InterussPoolStatusReadScope},
},
}
)

type GetVersionRequest struct {
Expand Down Expand Up @@ -177,6 +182,21 @@ type GetInstanceCAsResponseSet struct {
Response500 *api.InternalServerErrorBody
}

type GetScdLockModeRequest struct {
// The result of attempting to authorize this request
Auth api.AuthorizationResult
}
type GetScdLockModeResponseSet struct {
// The information is successfully returned.
Response200 *SCDLockModeResponse

// The server has not implemented this operation.
Response501 *ErrorResponse

// Auto-generated internal server error response
Response500 *api.InternalServerErrorBody
}

type Implementation interface {
// Queries the version of the DSS.
GetVersion(ctx context.Context, req *GetVersionRequest) GetVersionResponseSet
Expand All @@ -198,4 +218,7 @@ type Implementation interface {

// Current certificates of certificate authorities (CAs) that signed the node certificates for this DSS instance. May return more that one certificate (e.g. for rotations). Other DSS instances in the pool should accept node certificates signed by these CAs.
GetInstanceCAs(ctx context.Context, req *GetInstanceCAsRequest) GetInstanceCAsResponseSet

// Return the value of the 'enable_scd_global_lock' lock option. May be used to ensure all participants in a pool are using the same value.
GetScdLockMode(ctx context.Context, req *GetScdLockModeRequest) GetScdLockModeResponseSet
}
32 changes: 31 additions & 1 deletion pkg/api/auxv1/server.gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,35 @@ func (s *APIRouter) GetInstanceCAs(exp *regexp.Regexp, w http.ResponseWriter, r
api.WriteJSON(w, 500, api.InternalServerErrorBody{ErrorMessage: "Handler implementation did not set a response"})
}

func (s *APIRouter) GetScdLockMode(exp *regexp.Regexp, w http.ResponseWriter, r *http.Request) {
var req GetScdLockModeRequest

// Authorize request
req.Auth = s.Authorizer.Authorize(w, r, GetScdLockModeSecurity)

// Call implementation
ctx, cancel := context.WithCancel(r.Context())
defer cancel()
response := s.Implementation.GetScdLockMode(ctx, &req)

// Write response to client
if response.Response200 != nil {
api.WriteJSON(w, 200, response.Response200)
return
}
if response.Response501 != nil {
api.WriteJSON(w, 501, response.Response501)
return
}
if response.Response500 != nil {
api.WriteJSON(w, 500, response.Response500)
return
}
api.WriteJSON(w, 500, api.InternalServerErrorBody{ErrorMessage: "Handler implementation did not set a response"})
}

func MakeAPIRouter(impl Implementation, auth api.Authorizer) APIRouter {
router := APIRouter{Implementation: impl, Authorizer: auth, Routes: make([]*api.Route, 7)}
router := APIRouter{Implementation: impl, Authorizer: auth, Routes: make([]*api.Route, 8)}

pattern := regexp.MustCompile("^/aux/v1/version$")
router.Routes[0] = &api.Route{Method: http.MethodGet, Pattern: pattern, Handler: router.GetVersion}
Expand All @@ -290,5 +317,8 @@ func MakeAPIRouter(impl Implementation, auth api.Authorizer) APIRouter {
pattern = regexp.MustCompile("^/aux/v1/configuration/ca_certs$")
router.Routes[6] = &api.Route{Method: http.MethodGet, Pattern: pattern, Handler: router.GetInstanceCAs}

pattern = regexp.MustCompile("^/aux/v1/configuration/scd_lock_mode$")
router.Routes[7] = &api.Route{Method: http.MethodGet, Pattern: pattern, Handler: router.GetScdLockMode}

return router
}
5 changes: 5 additions & 0 deletions pkg/api/auxv1/types.gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,8 @@ type CAsResponse struct {
// A list of certificates, each in PEM format.
Cas *[]string `json:"CAs,omitempty"`
}

type SCDLockModeResponse struct {
// The value of the 'enable_scd_global_lock' option for this DSS instance
GlobalLock *bool `json:"global_lock,omitempty"`
}
4 changes: 2 additions & 2 deletions pkg/api/scdv1/interface.gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ import (
)

var (
UtmStrategicCoordinationScope = api.RequiredScope("utm.strategic_coordination")
UtmConformanceMonitoringSaScope = api.RequiredScope("utm.conformance_monitoring_sa")
UtmAvailabilityArbitrationScope = api.RequiredScope("utm.availability_arbitration")
UtmConstraintManagementScope = api.RequiredScope("utm.constraint_management")
UtmConformanceMonitoringSaScope = api.RequiredScope("utm.conformance_monitoring_sa")
UtmStrategicCoordinationScope = api.RequiredScope("utm.strategic_coordination")
UtmConstraintProcessingScope = api.RequiredScope("utm.constraint_processing")
QueryOperationalIntentReferencesSecurity = []api.AuthorizationOption{
{
Expand Down
11 changes: 11 additions & 0 deletions pkg/aux_/scd_lock_mode.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package aux

import (
"context"

restapi "github.com/interuss/dss/pkg/api/auxv1"
)

func (a *Server) GetScdLockMode(ctx context.Context, req *restapi.GetScdLockModeRequest) restapi.GetScdLockModeResponseSet {
return restapi.GetScdLockModeResponseSet{Response200: &restapi.SCDLockModeResponse{GlobalLock: &a.ScdGlobalLock}}
}
Loading
Loading