← Home

Alerting on AWS Security Hub Notifications with OpsGenie

In my previous post [0], I wrote about meeting the CIS AWS Foundations Benchmark using Terraform and some scripts.

However, many of the CIS benchmark controls require alerting on suspicious or unusual actions, such as use of the root user or changes to firewall rules.

I use CDK to configure all of the required alarms [1].

// CloudWatch metric namespaces used by the alarms in this file.
const (
	// cisMetricNamespace is the custom namespace that the CIS metric filters
	// publish to and that the CIS alarms read from.
	cisMetricNamespace = "cis"
	// snsMetricNamespace is the namespace in which Amazon SNS publishes its
	// own service metrics, including SMSMonthToDateSpentUSD. It must be
	// "AWS/SNS": an alarm pointed at any other namespace never receives data
	// for that metric and therefore never fires.
	snsMetricNamespace = "AWS/SNS"
)

// addAlarms creates both SNS topics, looks up the CloudTrail log group by
// name, and then registers every CIS metric filter/alarm pair followed by the
// SMS billing alarm.
//
// The CIS rules are kept in a table so that each entry reads as
// "threshold, name, filter pattern"; they are registered in table order.
func addAlarms(stack awscdk.Stack) {
	cisTopic := addAlarmSNSTopic(stack)
	billingTopic := addBillingTopic(stack)
	trailLogs := awslogs.LogGroup_FromLogGroupName(stack, jsii.String("cloudTrailLogGroup"), jsii.String("aws-controltower/CloudTrailLogs"))

	// CIS metrics and alarms. threshold is the number of matching events in a
	// 5 minute window at which the alarm fires.
	cisRules := []struct {
		threshold int
		name      string
		pattern   string
	}{
		{500, "UnauthorizedApiCalls", `{ ($.errorCode = "*UnauthorizedOperation") || ($.errorCode = "AccessDenied*") }`},
		{1, "NoMfaConsoleLogins", `{ $.userIdentity.sessionContext.attributes.mfaAuthenticated != "true" && $.userIdentity.invokedBy = "signin.amazonaws.com" }`},
		{1, "RootAccountLogins", `{ $.userIdentity.type = "Root" && $.userIdentity.invokedBy NOT EXISTS && $.eventType != "AwsServiceEvent" }`},
		{5, "IamPolicyChanges", `{($.eventName=DeleteGroupPolicy)||($.eventName=DeleteRolePolicy)||($.eventName=DeleteUserPolicy)||($.eventName=PutGroupPolicy)||($.eventName=PutRolePolicy)||($.eventName=PutUserPolicy)||($.eventName=CreatePolicy)||($.eventName=DeletePolicy)||($.eventName=CreatePolicyVersion)||($.eventName=DeletePolicyVersion)||($.eventName=AttachRolePolicy)||($.eventName=DetachRolePolicy)||($.eventName=AttachUserPolicy)||($.eventName=DetachUserPolicy)||($.eventName=AttachGroupPolicy)||($.eventName=DetachGroupPolicy)}`},
		{1, "CloudTrailConfigurationChanges", `{ ($.eventName = CreateTrail) ||($.eventName = UpdateTrail) || ($.eventName = DeleteTrail) || ($.eventName = StartLogging) || ($.eventName = StopLogging) }`},
		{100, "FailedConsoleLogins", `{ ($.eventName = ConsoleLogin) && ($.errorMessage = "Failed authentication") }`},
		{1, "DisabledOrDeletedCmks", `{($.eventSource = kms.amazonaws.com) && (($.eventName=DisableKey)||($.eventName=ScheduleKeyDeletion))}`},
		{5, "S3BucketPolicyChanges", `{ ($.eventSource = s3.amazonaws.com) && (($.eventName = PutBucketAcl) || ($.eventName = PutBucketPolicy) || ($.eventName = PutBucketCors) || ($.eventName = PutBucketLifecycle) || ($.eventName = PutBucketReplication) || ($.eventName = DeleteBucketPolicy) || ($.eventName = DeleteBucketCors) || ($.eventName = DeleteBucketLifecycle) || ($.eventName = DeleteBucketReplication)) }`},
		{1, "AwsConfigChanges", `{($.eventSource = config.amazonaws.com) && (($.eventName=StopConfigurationRecorder)||($.eventName=DeleteDeliveryChannel)||($.eventName=PutDeliveryChannel)||($.eventName=PutConfigurationRecorder))}`},
		{1, "SecurityGroupChanges", `{ ($.eventName = AuthorizeSecurityGroupIngress) || ($.eventName = AuthorizeSecurityGroupEgress) || ($.eventName = RevokeSecurityGroupIngress) || ($.eventName = RevokeSecurityGroupEgress) || ($.eventName = CreateSecurityGroup) || ($.eventName = DeleteSecurityGroup)}`},
		{1, "NetworkAccessControlListChanges", `{ ($.eventName = CreateNetworkAcl) || ($.eventName = CreateNetworkAclEntry) || ($.eventName = DeleteNetworkAcl) || ($.eventName = DeleteNetworkAclEntry) || ($.eventName = ReplaceNetworkAclEntry) || ($.eventName = ReplaceNetworkAclAssociation) }`},
		{1, "NetworkGatewayChanges", `{ ($.eventName = CreateCustomerGateway) || ($.eventName = DeleteCustomerGateway) || ($.eventName = AttachInternetGateway) || ($.eventName = CreateInternetGateway) || ($.eventName = DeleteInternetGateway) || ($.eventName = DetachInternetGateway) }`},
		{1, "RouteTableChanges", `{ ($.eventName = CreateRoute) || ($.eventName = CreateRouteTable) || ($.eventName = ReplaceRoute) || ($.eventName = ReplaceRouteTableAssociation) || ($.eventName = DeleteRouteTable) || ($.eventName = DeleteRoute) || ($.eventName= DisassociateRouteTable) }`},
		{1, "VPCChanges", `{ ($.eventName = CreateVpc) || ($.eventName = DeleteVpc) || ($.eventName = ModifyVpcAttribute) || ($.eventName = AcceptVpcPeeringConnection) || ($.eventName = CreateVpcPeeringConnection) || ($.eventName = DeleteVpcPeeringConnection) || ($.eventName = RejectVpcPeeringConnection) || ($.eventName = AttachClassicLinkVpc) || ($.eventName = DetachClassicLinkVpc) || ($.eventName = DisableVpcClassicLink) || ($.eventName = EnableVpcClassicLink) }`},
	}
	for _, rule := range cisRules {
		addCISMetricAndAlarm(stack, cisTopic, trailLogs, rule.threshold, rule.name, rule.pattern)
	}

	// Billing alarms.
	addBillingAlarm(stack, billingTopic, "SMSMonthToDateSpentUSD")
}

// addBillingAlarm creates a CloudWatch alarm named metricName+"Alarm" on the
// metric metricName in snsMetricNamespace and sends its notifications to
// topic.
//
// The alarm fires when the p90 of the metric over a single 1 minute period is
// greater than or equal to 450. Periods with no data are treated as not
// breaching.
//
// topic is accepted as awssns.ITopic, matching addCISMetricAndAlarm, so that
// any topic implementation can be passed; callers that pass an awssns.Topic
// are unaffected.
func addBillingAlarm(stack awscdk.Stack, topic awssns.ITopic, metricName string) {
	// Add alarm.
	m := awscloudwatch.NewMetric(&awscloudwatch.MetricProps{
		MetricName: jsii.String(metricName),
		Namespace:  jsii.String(snsMetricNamespace),
	})
	alarm := awscloudwatch.NewAlarm(stack, jsii.String(metricName+"Alarm"), &awscloudwatch.AlarmProps{
		AlarmDescription:   jsii.String("SNS billing metric alarm."),
		AlarmName:          jsii.String(metricName + "Alarm"),
		Metric:             m,                                                                   // The metric is...
		Statistic:          jsii.String("p90"),                                                  // The 90th percentile: 90% of values are lower than this value, 10% higher
		Period:             awscdk.Duration_Minutes(jsii.Number(1)),                             // Over a 1 minute period.
		EvaluationPeriods:  jsii.Number(1),                                                      // If, in the last "1" of those periods
		DatapointsToAlarm:  jsii.Number(1),                                                      // There's at least one datapoint
		ComparisonOperator: awscloudwatch.ComparisonOperator_GREATER_THAN_OR_EQUAL_TO_THRESHOLD, // Where the metric is >=
		Threshold:          jsii.Number(450),                                                    // The cost in dollars that we alert on if we breach
		ActionsEnabled:     jsii.Bool(true),                                                     // Do the actions.
		TreatMissingData:   awscloudwatch.TreatMissingData_NOT_BREACHING,                        // Periods with no data count as not breaching.
	})
	alarm.AddAlarmAction(awscloudwatchactions.NewSnsAction(topic))
}

// addBillingTopic creates the encrypted SNS topic that billing alarms publish
// to, allows CloudWatch to publish to it, and exports the topic's ARN and name
// as the stack outputs "billing-alarm-topic-arn" and
// "billing-alarm-topic-name". It returns the new topic.
//
// The topic is encrypted with a dedicated customer managed KMS key rather than
// the AWS managed SNS key. CloudWatch needs kms:Decrypt and
// kms:GenerateDataKey on the topic's key to deliver alarm notifications, and
// the key policy of an AWS managed key cannot be edited; calling
// AddToResourcePolicy on an alias imported with Alias_FromAliasName adds
// nothing, so alarms would silently fail to notify.
func addBillingTopic(stack awscdk.Stack) awssns.Topic {
	billingEncryptionKey := awskms.NewKey(stack, jsii.String("billingAlarmTopicKey"), &awskms.KeyProps{
		Alias:             jsii.String("billingAlarmTopicKey"),
		Description:       jsii.String("Key to encrypt the billing alarm SNS topic"),
		EnableKeyRotation: jsii.Bool(true),
	})
	// Let CloudWatch use the key when it publishes alarm notifications.
	billingEncryptionKey.AddToResourcePolicy(awsiam.NewPolicyStatement(&awsiam.PolicyStatementProps{
		Actions: &[]*string{
			jsii.String("kms:Decrypt"),
			jsii.String("kms:GenerateDataKey"),
		},
		Effect: awsiam.Effect_ALLOW,
		Principals: &[]awsiam.IPrincipal{
			awsiam.NewServicePrincipal(jsii.String("cloudwatch.amazonaws.com"), &awsiam.ServicePrincipalOpts{}),
		},
		Resources: &[]*string{jsii.String("*")},
	}), jsii.Bool(true))

	topic := awssns.NewTopic(stack, jsii.String("billingAlarmTopic"), &awssns.TopicProps{
		DisplayName: jsii.String("billingAlarmTopic"),
		MasterKey:   billingEncryptionKey,
	})
	// Let CloudWatch publish to the topic itself.
	topic.AddToResourcePolicy(awsiam.NewPolicyStatement(&awsiam.PolicyStatementProps{
		Actions: &[]*string{jsii.String("sns:Publish")},
		Effect:  awsiam.Effect_ALLOW,
		Principals: &[]awsiam.IPrincipal{
			awsiam.NewServicePrincipal(jsii.String("cloudwatch.amazonaws.com"), &awsiam.ServicePrincipalOpts{}),
		},
		Resources: &[]*string{topic.TopicArn()},
	}))
	awscdk.NewCfnOutput(stack, jsii.String("billingAlarmTopicArn"), &awscdk.CfnOutputProps{
		ExportName: jsii.String("billing-alarm-topic-arn"),
		Value:      topic.TopicArn(),
	})
	awscdk.NewCfnOutput(stack, jsii.String("billingAlarmTopicName"), &awscdk.CfnOutputProps{
		ExportName: jsii.String("billing-alarm-topic-name"),
		Value:      topic.TopicName(),
	})
	return topic
}

// addAlarmSNSTopic creates the SNS topic that the CIS alarms publish to,
// together with a rotating KMS key that encrypts it. CloudWatch is granted
// kms:Decrypt and kms:GenerateDataKey on the key and sns:Publish on the topic.
// The topic's ARN and name are exported as the stack outputs
// "alarm-topic-arn" and "alarm-topic-name". It returns the new topic.
func addAlarmSNSTopic(stack awscdk.Stack) awssns.Topic {
	// The same service principal is granted access to both the key and the topic.
	cloudWatch := awsiam.NewServicePrincipal(jsii.String("cloudwatch.amazonaws.com"), &awsiam.ServicePrincipalOpts{})

	key := awskms.NewKey(stack, jsii.String("alarmTopicKey"), &awskms.KeyProps{
		Alias:             jsii.String("alarmTopicKey"),
		Description:       jsii.String("Key to encrypt the alarm SNS topic"),
		EnableKeyRotation: jsii.Bool(true),
	})
	keyAccess := awsiam.NewPolicyStatement(&awsiam.PolicyStatementProps{
		Actions: &[]*string{
			jsii.String("kms:Decrypt"),
			jsii.String("kms:GenerateDataKey"),
		},
		Effect:     awsiam.Effect_ALLOW,
		Principals: &[]awsiam.IPrincipal{cloudWatch},
		Resources:  &[]*string{jsii.String("*")},
	})
	key.AddToResourcePolicy(keyAccess, jsii.Bool(true))

	alarmTopic := awssns.NewTopic(stack, jsii.String("alarmTopic"), &awssns.TopicProps{
		DisplayName: jsii.String("alarmTopic"),
		MasterKey:   key,
	})
	publishAccess := awsiam.NewPolicyStatement(&awsiam.PolicyStatementProps{
		Actions:    &[]*string{jsii.String("sns:Publish")},
		Effect:     awsiam.Effect_ALLOW,
		Principals: &[]awsiam.IPrincipal{cloudWatch},
		Resources:  &[]*string{alarmTopic.TopicArn()},
	})
	alarmTopic.AddToResourcePolicy(publishAccess)

	// Export the topic's identifiers as stack outputs.
	awscdk.NewCfnOutput(stack, jsii.String("alarmTopicArn"), &awscdk.CfnOutputProps{
		ExportName: jsii.String("alarm-topic-arn"),
		Value:      alarmTopic.TopicArn(),
	})
	awscdk.NewCfnOutput(stack, jsii.String("alarmTopicName"), &awscdk.CfnOutputProps{
		ExportName: jsii.String("alarm-topic-name"),
		Value:      alarmTopic.TopicName(),
	})
	return alarmTopic
}

// addCISMetricAndAlarm adds a metric filter to logGroup that records the value
// 1 against the metric name in cisMetricNamespace for each log event matching
// pattern, then creates an alarm called name+"Alarm" on that metric which
// notifies topic.
//
// The alarm evaluates the sum of the metric over one 5 minute period and fires
// when that sum is greater than or equal to countIn5Minutes. Periods with no
// data are treated as not breaching.
func addCISMetricAndAlarm(stack awscdk.Stack, topic awssns.ITopic, logGroup awslogs.ILogGroup, countIn5Minutes int, name, pattern string) {
	filterProps := &awslogs.MetricFilterProps{
		LogGroup:        logGroup,
		MetricNamespace: jsii.String(cisMetricNamespace),
		MetricName:      jsii.String(name),
		FilterPattern:   awslogs.FilterPattern_Literal(jsii.String(pattern)),
		MetricValue:     jsii.String("1"),
	}
	awslogs.NewMetricFilter(stack, jsii.String(name), filterProps)

	// The alarm reads the same namespace/name pair the filter writes to.
	cisMetric := awscloudwatch.NewMetric(&awscloudwatch.MetricProps{
		MetricName: jsii.String(name),
		Namespace:  jsii.String(cisMetricNamespace),
	})
	alarmName := name + "Alarm"
	alarmProps := &awscloudwatch.AlarmProps{
		AlarmDescription:   jsii.String("CIS metric alarm."),
		AlarmName:          jsii.String(alarmName),
		Metric:             cisMetric,
		Statistic:          jsii.String("sum"),
		Period:             awscdk.Duration_Minutes(jsii.Number(5)),
		EvaluationPeriods:  jsii.Number(1),
		DatapointsToAlarm:  jsii.Number(1),
		ComparisonOperator: awscloudwatch.ComparisonOperator_GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
		Threshold:          jsii.Number(float64(countIn5Minutes)),
		ActionsEnabled:     jsii.Bool(true),
		TreatMissingData:   awscloudwatch.TreatMissingData_NOT_BREACHING,
	}
	cisAlarm := awscloudwatch.NewAlarm(stack, jsii.String(alarmName), alarmProps)
	cisAlarm.AddAlarmAction(awscloudwatchactions.NewSnsAction(topic))
}

But… there’s no point having an alarm if no-one is taking action on the alarm being triggered, so the control also checks that when the alarm triggers, a notification is sent to an Amazon SNS Topic.

The first question a security auditor usually asks when they hear about this is what happens to the alerts, since if our system has been compromised, it’s important for us to act quickly to resolve the issue.

This is where OpsGenie comes in.

OpsGenie to the rescue

OpsGenie is an Atlassian product that helps teams to manage alerts.

In my teams, we’re usually responsible for building new product features, managing the service and infrastructure, and providing out-of-hours support. We have a rota so that one person is responsible for being “on-call” for a week at a time.

OpsGenie manages the rota for us, and automatically routes alerts to the person that’s currently on call.

There’s a mobile app to receive notifications, but OpsGenie will also make a phone call and send text messages so it’s really hard to miss a notification.

There’s also an audit trail of what happened with alerts, so you can see exactly when a notification was sent, and when it was acknowledged by the on-call team member, which helps when we’re establishing timelines of events.

Since it’s possible that someone might be out of mobile phone coverage and not receive a notification, OpsGenie also has an escalation feature - if the person who’s on call doesn’t answer, then you can configure OpsGenie to send it to a backup person or team.

As a general rule, people don’t like being woken up unless it’s for a good reason. So we also use OpsGenie to filter out noisy alerts completely, and only send high priority notifications when outside working hours - we can pick up lower priority alerts in the morning.

Setup

Since the team was already using Jira, there was no hassle around setting up new user accounts, or concerns about getting new suppliers on-boarded - it was just a case of adding OpsGenie to the existing Jira billing.

From Amazon SNS, it’s just a case of subscribing OpsGenie to the alarms SNS topic.

Summary

Applying a few scripts can automate the resolution of a number of CIS Foundation Benchmark controls, but you’ll need a process for managing alerts.

The alerts can be set up automatically with CDK, and used to trigger an out-of-hours support process via a webhook.