From d75b1de62bff639697e6257d2db26f866b5379e1 Mon Sep 17 00:00:00 2001 From: Alex Chantavy Date: Fri, 14 Jul 2023 13:16:49 -0700 Subject: [PATCH] #1210: EBSVolume => new data model, Allow node attr updates from multiple intel modules (#1214) See #1210 for full context. #1154 tried to solve this problem by updating the querybuilder but this was too complex and would not generalize well. This solution is simpler where we use different property classes for each API response so that we don't overwrite properties on a node set by another sync job. This PR can be reviewed commit-by-commit: - c0d9ac4cc35fd93e260964cfa41a813e2a2032a7 shows a repro of the error with a failing integration test. - facb63bcbac6b68eec0fd2ea3f6b0550ac40eb10 shows the solution using multiple classes. --------- Co-authored-by: Ramon Petgrave <32398091+ramonpetgrave64@users.noreply.github.com> --- cartography/intel/aws/ec2/instances.py | 4 +- cartography/intel/aws/ec2/volumes.py | 118 +++++------ cartography/intel/aws/util/arns.py | 18 ++ cartography/models/aws/ec2/volumes.py | 46 ++++- tests/data/aws/ec2/volumes.py | 4 +- .../intel/aws/ec2/test_ec2_volumes.py | 194 +++++++++++------- 6 files changed, 237 insertions(+), 147 deletions(-) create mode 100644 cartography/intel/aws/util/arns.py diff --git a/cartography/intel/aws/ec2/instances.py b/cartography/intel/aws/ec2/instances.py index 1c69eb06a8..87c7e32028 100644 --- a/cartography/intel/aws/ec2/instances.py +++ b/cartography/intel/aws/ec2/instances.py @@ -17,7 +17,7 @@ from cartography.models.aws.ec2.reservations import EC2ReservationSchema from cartography.models.aws.ec2.securitygroups import EC2SecurityGroupSchema from cartography.models.aws.ec2.subnets import EC2SubnetSchema -from cartography.models.aws.ec2.volumes import EBSVolumeSchema +from cartography.models.aws.ec2.volumes import EBSVolumeInstanceSchema from cartography.util import aws_handle_regions from cartography.util import timeit @@ -273,7 +273,7 @@ def load_ec2_instance_ebs_volumes( ) -> None: load( neo4j_session, - EBSVolumeSchema(), + EBSVolumeInstanceSchema(), ebs_data, Region=region, AWS_ID=current_aws_account_id, diff --git a/cartography/intel/aws/ec2/volumes.py b/cartography/intel/aws/ec2/volumes.py index 6de03c7d42..3ad50f1186 100644 --- a/cartography/intel/aws/ec2/volumes.py +++ b/cartography/intel/aws/ec2/volumes.py @@ -6,7 +6,9 @@ import boto3 import neo4j +from cartography.client.core.tx import load from cartography.graph.job import GraphJob +from cartography.intel.aws.util.arns import build_arn from cartography.models.aws.ec2.volumes import EBSVolumeSchema from cartography.util import aws_handle_regions from cartography.util import timeit @@ -16,7 +18,7 @@ @timeit @aws_handle_regions -def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict]: +def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict[str, Any]]: client = boto3_session.client('ec2', region_name=region) paginator = client.get_paginator('describe_volumes') volumes: List[Dict] = [] @@ -26,90 +28,76 @@ def get_volumes(boto3_session: boto3.session.Session, region: str) -> List[Dict] def transform_volumes(volumes: List[Dict[str, Any]], region: str, current_aws_account_id: str) -> List[Dict[str, Any]]: + result = [] for volume in volumes: - volume['VolumeArn'] = f"arn:aws:ec2:{region}:{current_aws_account_id}:volume/{volume['VolumeId']}" - volume['CreateTime'] = str(volume['CreateTime']) - return volumes + attachments = volume.get('Attachments', []) + active_attachments = [a for a in attachments if a['State'] == 'attached'] + + volume_id = volume['VolumeId'] + raw_vol = ({ + 'Arn': build_arn('ec2', current_aws_account_id, 'volume', volume_id, region), + 'AvailabilityZone': volume['AvailabilityZone'], + 'CreateTime': volume['CreateTime'], + 'Encrypted': volume['Encrypted'], + 'Size': volume['Size'], + 'State': volume['State'], + 'OutpostArn': volume['OutpostArn'], + 'SnapshotId': volume['SnapshotId'], + 'Iops': volume['Iops'], + 'FastRestored': volume['FastRestored'], + 'MultiAttachEnabled': volume['MultiAttachEnabled'], + 'VolumeType': volume['VolumeType'], + 'VolumeId': volume_id, + 'KmsKeyId': volume['KmsKeyId'], + }) + + if not active_attachments: + result.append(raw_vol) + continue + + for attachment in active_attachments: + vol_with_attachment = raw_vol.copy() + vol_with_attachment['InstanceId'] = attachment['InstanceId'] + result.append(vol_with_attachment) + + return result @timeit def load_volumes( - neo4j_session: neo4j.Session, data: List[Dict], region: str, current_aws_account_id: str, update_tag: int, + neo4j_session: neo4j.Session, + ebs_data: List[Dict[str, Any]], + region: str, + current_aws_account_id: str, + update_tag: int, ) -> None: - ingest_volumes = """ - UNWIND $volumes_list as volume - MERGE (vol:EBSVolume{id: volume.VolumeId}) - ON CREATE SET vol.firstseen = timestamp() - SET vol.arn = volume.VolumeArn, - vol.lastupdated = $update_tag, - vol.availabilityzone = volume.AvailabilityZone, - vol.createtime = volume.CreateTime, - vol.encrypted = volume.Encrypted, - vol.size = volume.Size, - vol.state = volume.State, - vol.outpostarn = volume.OutpostArn, - vol.snapshotid = volume.SnapshotId, - vol.iops = volume.Iops, - vol.fastrestored = volume.FastRestored, - vol.multiattachenabled = volume.MultiAttachEnabled, - vol.type = volume.VolumeType, - vol.kmskeyid = volume.KmsKeyId, - vol.region=$Region - WITH vol - MATCH (aa:AWSAccount{id: $AWS_ACCOUNT_ID}) - MERGE (aa)-[r:RESOURCE]->(vol) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $update_tag - """ - - neo4j_session.run( - ingest_volumes, - volumes_list=data, - AWS_ACCOUNT_ID=current_aws_account_id, + load( + neo4j_session, + EBSVolumeSchema(), + ebs_data, Region=region, - update_tag=update_tag, + AWS_ID=current_aws_account_id, + lastupdated=update_tag, ) -def load_volume_relationships( - neo4j_session: neo4j.Session, - volumes: List[Dict[str, Any]], - aws_update_tag: int, -) -> None: - add_relationship_query = """ - MATCH (volume:EBSVolume{arn: $VolumeArn}) - WITH volume - MATCH (instance:EC2Instance{instanceid: $InstanceId}) - MERGE (volume)-[r:ATTACHED_TO_EC2_INSTANCE]->(instance) - ON CREATE SET r.firstseen = timestamp() - SET r.lastupdated = $aws_update_tag - """ - for volume in volumes: - for attachment in volume.get('Attachments', []): - if attachment['State'] != 'attached': - continue - neo4j_session.run( - add_relationship_query, - VolumeArn=volume['VolumeArn'], - InstanceId=attachment['InstanceId'], - aws_update_tag=aws_update_tag, - ) - - @timeit -def cleanup_volumes(neo4j_session: neo4j.Session, common_job_parameters: Dict) -> None: +def cleanup_volumes(neo4j_session: neo4j.Session, common_job_parameters: Dict[str, Any]) -> None: GraphJob.from_node_schema(EBSVolumeSchema(), common_job_parameters).run(neo4j_session) @timeit def sync_ebs_volumes( - neo4j_session: neo4j.Session, boto3_session: boto3.session.Session, regions: List[str], - current_aws_account_id: str, update_tag: int, common_job_parameters: Dict, + neo4j_session: neo4j.Session, + boto3_session: boto3.session.Session, + regions: List[str], + current_aws_account_id: str, + update_tag: int, + common_job_parameters: Dict[str, Any], ) -> None: for region in regions: logger.debug("Syncing volumes for region '%s' in account '%s'.", region, current_aws_account_id) data = get_volumes(boto3_session, region) transformed_data = transform_volumes(data, region, current_aws_account_id) load_volumes(neo4j_session, transformed_data, region, current_aws_account_id, update_tag) - load_volume_relationships(neo4j_session, transformed_data, update_tag) cleanup_volumes(neo4j_session, common_job_parameters) diff --git a/cartography/intel/aws/util/arns.py b/cartography/intel/aws/util/arns.py new file mode 100644 index 0000000000..e6108b82c3 --- /dev/null +++ b/cartography/intel/aws/util/arns.py @@ -0,0 +1,18 @@ +from typing import Optional + + +def build_arn( + resource: str, + account: str, + typename: str, + name: str, + region: Optional[str] = None, + partition: Optional[str] = None, +) -> str: + if not partition: + # TODO: support partitions from others. Please file an issue on this if needed, would love to hear from you + partition = 'aws' + if not region: + # Some resources are present in all regions, e.g. IAM policies + region = "" + return f"arn:{partition}:{resource}:{region}:{account}:{typename}/{name}" diff --git a/cartography/models/aws/ec2/volumes.py b/cartography/models/aws/ec2/volumes.py index 2140f4fcd0..bb6925780c 100644 --- a/cartography/models/aws/ec2/volumes.py +++ b/cartography/models/aws/ec2/volumes.py @@ -13,10 +13,23 @@ @dataclass(frozen=True) class EBSVolumeNodeProperties(CartographyNodeProperties): + arn: PropertyRef = PropertyRef('Arn', extra_index=True) id: PropertyRef = PropertyRef('VolumeId') + volumeid: PropertyRef = PropertyRef('VolumeId', extra_index=True) region: PropertyRef = PropertyRef('Region', set_in_kwargs=True) lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) - deleteontermination: PropertyRef = PropertyRef('DeleteOnTermination') + availabilityzone: PropertyRef = PropertyRef('AvailabilityZone') + createtime: PropertyRef = PropertyRef('CreateTime') + encrypted: PropertyRef = PropertyRef('Encrypted') + size: PropertyRef = PropertyRef('Size') + state: PropertyRef = PropertyRef('State') + outpostarn: PropertyRef = PropertyRef('OutpostArn') + snapshotid: PropertyRef = PropertyRef('SnapshotId') + iops: PropertyRef = PropertyRef('Iops') + fastrestored: PropertyRef = PropertyRef('FastRestored') + multiattachenabled: PropertyRef = PropertyRef('MultiAttachEnabled') + type: PropertyRef = PropertyRef('VolumeType') + kmskeyid: PropertyRef = PropertyRef('KmsKeyId') @dataclass(frozen=True) @@ -53,6 +66,9 @@ class EBSVolumeToEC2Instance(CartographyRelSchema): @dataclass(frozen=True) class EBSVolumeSchema(CartographyNodeSchema): + """ + EBS Volume properties as returned from the EBS Volume API response + """ label: str = 'EBSVolume' properties: EBSVolumeNodeProperties = EBSVolumeNodeProperties() sub_resource_relationship: EBSVolumeToAWSAccount = EBSVolumeToAWSAccount() @@ -61,3 +77,31 @@ class EBSVolumeSchema(CartographyNodeSchema): EBSVolumeToEC2Instance(), ], ) + + +@dataclass(frozen=True) +class EBSVolumeInstanceProperties(CartographyNodeProperties): + """ + EBS Volume properties as known by an EC2 instance. + The EC2 instance API response includes a `deleteontermination` field and the volume id. + """ + arn: PropertyRef = PropertyRef('Arn', extra_index=True) + id: PropertyRef = PropertyRef('VolumeId') + volumeid: PropertyRef = PropertyRef('VolumeId', extra_index=True) + lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True) + deleteontermination: PropertyRef = PropertyRef('DeleteOnTermination') + + +@dataclass(frozen=True) +class EBSVolumeInstanceSchema(CartographyNodeSchema): + """ + EBS Volume from EC2 Instance API response. This is separate from `EBSVolumeSchema` to prevent issue #1210. + """ + label: str = 'EBSVolume' + properties: EBSVolumeInstanceProperties = EBSVolumeInstanceProperties() + sub_resource_relationship: EBSVolumeToAWSAccount = EBSVolumeToAWSAccount() + other_relationships: OtherRelationships = OtherRelationships( + [ + EBSVolumeToEC2Instance(), + ], + ) diff --git a/tests/data/aws/ec2/volumes.py b/tests/data/aws/ec2/volumes.py index e3a29c03fa..6e08003e73 100644 --- a/tests/data/aws/ec2/volumes.py +++ b/tests/data/aws/ec2/volumes.py @@ -14,7 +14,7 @@ 'Size': 123, 'SnapshotId': 'sn-01', 'State': 'available', - 'VolumeId': 'v-01', + 'VolumeId': 'vol-0df', 'Iops': 123, 'VolumeType': 'standard', 'FastRestored': True, @@ -33,7 +33,7 @@ 'OutpostArn': 'arn1', 'Size': 123, 'State': 'available', - 'VolumeId': 'v-02', + 'VolumeId': 'vol-03', 'Iops': 123, 'SnapshotId': 'sn-02', 'VolumeType': 'standard', diff --git a/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py b/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py index 705d2db3ec..b852581303 100644 --- a/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py +++ b/tests/integration/cartography/intel/aws/ec2/test_ec2_volumes.py @@ -3,94 +3,116 @@ import cartography.intel.aws.ec2.instances import cartography.intel.aws.ec2.volumes -import tests.data.aws.ec2.instances -import tests.data.aws.ec2.volumes from cartography.intel.aws.ec2.instances import sync_ec2_instances +from cartography.intel.aws.ec2.volumes import sync_ebs_volumes from tests.data.aws.ec2.instances import DESCRIBE_INSTANCES - +from tests.data.aws.ec2.volumes import DESCRIBE_VOLUMES +from tests.integration.cartography.intel.aws.common import create_test_account +from tests.integration.util import check_nodes +from tests.integration.util import check_rels TEST_ACCOUNT_ID = '000000000000' TEST_REGION = 'eu-west-1' TEST_UPDATE_TAG = 123456789 -def test_load_volumes(neo4j_session): +@patch.object(cartography.intel.aws.ec2.volumes, 'get_volumes', return_value=DESCRIBE_VOLUMES) +def test_sync_ebs_volumes(mock_get_vols, neo4j_session): # Arrange - data = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_data = cartography.intel.aws.ec2.volumes.transform_volumes(data, TEST_REGION, TEST_ACCOUNT_ID) + boto3_session = MagicMock() + create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG) # Act - cartography.intel.aws.ec2.volumes.load_volumes( + sync_ebs_volumes( neo4j_session, - transformed_data, - TEST_REGION, + boto3_session, + [TEST_REGION], TEST_ACCOUNT_ID, TEST_UPDATE_TAG, + {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) # Assert - expected_nodes = { - "v-01", "v-02", + assert check_nodes(neo4j_session, 'EBSVolume', ['arn']) == { + ('arn:aws:ec2:eu-west-1:000000000000:volume/vol-03',), + ('arn:aws:ec2:eu-west-1:000000000000:volume/vol-0df',), } - nodes = neo4j_session.run( - """ - MATCH (r:EBSVolume) RETURN r.id; - """, - ) - actual_nodes = {n['r.id'] for n in nodes} - - assert actual_nodes == expected_nodes - - -def test_load_volume_to_account_rels(neo4j_session): + # Assert + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + (TEST_ACCOUNT_ID, 'vol-03'), + (TEST_ACCOUNT_ID, 'vol-0df'), + } - # Arrange: Create Test AWSAccount - neo4j_session.run( - """ - MERGE (aws:AWSAccount{id: $aws_account_id}) - ON CREATE SET aws.firstseen = timestamp() - SET aws.lastupdated = $aws_update_tag - """, - aws_account_id=TEST_ACCOUNT_ID, - aws_update_tag=TEST_UPDATE_TAG, - ) - # Act: Load Test Volumes - data = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_data = cartography.intel.aws.ec2.volumes.transform_volumes(data, TEST_REGION, TEST_ACCOUNT_ID) +@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations']) +@patch.object(cartography.intel.aws.ec2.volumes, 'get_volumes', return_value=DESCRIBE_VOLUMES) +def test_sync_ebs_volumes_e2e(mock_get_vols, mock_get_instances, neo4j_session): + # Arrange + neo4j_session.run('MATCH (n) DETACH DELETE n;') + boto3_session = MagicMock() + create_test_account(neo4j_session, TEST_ACCOUNT_ID, TEST_UPDATE_TAG) - cartography.intel.aws.ec2.volumes.load_volumes( + # Act: sync_ec2_instances() loads attached ebs volumes + sync_ec2_instances( neo4j_session, - transformed_data, - TEST_REGION, + boto3_session, + [TEST_REGION], TEST_ACCOUNT_ID, TEST_UPDATE_TAG, + {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) - # Assert - expected = { - (TEST_ACCOUNT_ID, 'v-01'), - (TEST_ACCOUNT_ID, 'v-02'), + # Assert that deleteontermination is set by sync_ec2_instances. The encrypted property isn't returned by this API. + assert check_nodes(neo4j_session, 'EBSVolume', ['id', 'deleteontermination', 'encrypted']) == { + ('vol-03', True, None), + ('vol-04', True, None), + ('vol-09', True, None), + ('vol-0df', True, None), } - result = neo4j_session.run( - """ - MATCH (n1:AWSAccount)-[:RESOURCE]->(n2:EBSVolume) RETURN n1.id, n2.id; - """, - ) - actual = { - (r['n1.id'], r['n2.id']) for r in result + # Assert that they are attached to the instances + assert check_rels( + neo4j_session, + 'EC2Instance', + 'instanceid', + 'EBSVolume', + 'volumeid', + 'ATTACHED_TO', + rel_direction_right=False, + ) == { + ('i-01', 'vol-0df'), + ('i-02', 'vol-03'), + ('i-03', 'vol-09'), + ('i-04', 'vol-04'), } - assert actual == expected - + # Assert that we created the account to volume rels correctly + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + ('000000000000', 'vol-03'), + ('000000000000', 'vol-04'), + ('000000000000', 'vol-09'), + ('000000000000', 'vol-0df'), + } -@patch.object(cartography.intel.aws.ec2.instances, 'get_ec2_instances', return_value=DESCRIBE_INSTANCES['Reservations']) -def test_load_volume_to_instance_rels(mock_get_instances, neo4j_session): - # Arrange: Load in ec2 instances first - boto3_session = MagicMock() - sync_ec2_instances( + # Act + sync_ebs_volumes( neo4j_session, boto3_session, [TEST_REGION], @@ -98,28 +120,46 @@ def test_load_volume_to_instance_rels(mock_get_instances, neo4j_session): TEST_UPDATE_TAG, {'UPDATE_TAG': TEST_UPDATE_TAG, 'AWS_ID': TEST_ACCOUNT_ID}, ) - # Prep the volume data - raw_volumes = tests.data.aws.ec2.volumes.DESCRIBE_VOLUMES - transformed_volumes = cartography.intel.aws.ec2.volumes.transform_volumes(raw_volumes, TEST_REGION, TEST_ACCOUNT_ID) - # Act - cartography.intel.aws.ec2.volumes.load_volume_relationships( - neo4j_session, - transformed_volumes, - TEST_UPDATE_TAG, - ) + # Assert that additional fields such as `encrypted` have been added by sync_ebs_volumes(), while + # deleteontermination has not been overwritten with None by sync_ebs_volumes() + assert check_nodes(neo4j_session, 'EBSVolume', ['id', 'deleteontermination', 'encrypted']) == { + # Attached to the instances initially + ('vol-04', True, None), + ('vol-09', True, None), + # Added by ebs sync + ('vol-03', True, True), + ('vol-0df', True, True), + } - # Assert - result = neo4j_session.run( - """ - MATCH (n1:EC2Instance)<-[:ATTACHED_TO_EC2_INSTANCE]-(n2:EBSVolume) RETURN n1.id, n2.id; - """, - ) - expected = { - ('i-01', 'v-01'), - ('i-02', 'v-02'), + # Assert that they are still attached to the instances + assert check_rels( + neo4j_session, + 'EC2Instance', + 'instanceid', + 'EBSVolume', + 'volumeid', + 'ATTACHED_TO', + rel_direction_right=False, + ) == { + ('i-01', 'vol-0df'), + ('i-02', 'vol-03'), + ('i-03', 'vol-09'), + ('i-04', 'vol-04'), } - actual = { - (r['n1.id'], r['n2.id']) for r in result + + # Assert that the account to volume rels still exist + assert check_rels( + neo4j_session, + 'AWSAccount', + 'id', + 'EBSVolume', + 'volumeid', + 'RESOURCE', + rel_direction_right=True, + ) == { + ('000000000000', 'vol-03'), + ('000000000000', 'vol-04'), + ('000000000000', 'vol-09'), + ('000000000000', 'vol-0df'), } - assert actual == expected