Skip to content
This repository has been archived by the owner on Feb 1, 2024. It is now read-only.

Commit

Permalink
Bug/path duplication (#110)
Browse files Browse the repository at this point in the history
Fixes issues arising from taking MonitoredPath out of the Harvester > ObservedFile > Dataset hierarchy.

MonitoredPaths are now related to permissions, but dynamically related to ObservedFile and Dataset entries by matching the ObservedFile path against the MonitoredPath path and regex.

If path matching proves too demanding on CPU time, MonitoredPaths can be coded to register with ObservedFiles when they attempt to update or create those files.
  • Loading branch information
mjaquiery committed Jun 30, 2023
1 parent 4ac1804 commit cb927e7
Show file tree
Hide file tree
Showing 35 changed files with 560 additions and 496 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/unit-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,20 +76,21 @@ jobs:

- name: Set up secrets
run: |
touch .env.secret
echo "POSTGRES_PASSWORD=$POSTGRES_PASSWORD" > .env.secret
echo "DJANGO_SECRET_KEY=$DJANGO_SECRET_KEY" >> .env.secret
echo "FRONTEND_VIRTUAL_HOST=$FRONTEND_VIRTUAL_HOST" >> .env.secret
echo "VIRTUAL_HOST=$VIRTUAL_HOST" >> .env.secret
echo "POSTGRES_HOST=postgres_test" >> .env.secret
- name: Build the stack
run: docker-compose -f docker-compose.test.yml up -d --build app_test

# Enable tmate debugging of manually-triggered workflows if the input option was provided
- name: Setup tmate session
uses: mxschmitt/action-tmate@v3
if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}

- name: Build the stack
run: docker-compose -f docker-compose.test.yml up -d --build app_test

- name: Run tests
run: docker-compose -f docker-compose.test.yml run --rm app_test bash -c "cd .. && ./server.sh"

Expand Down
21 changes: 8 additions & 13 deletions backend/backend_django/galv/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ class MonitoredPath(models.Model):
default=60,
help_text="Number of seconds files must remain stable to be processed"
)
active = models.BooleanField(default=True, null=False)
admin_group = models.ForeignKey(
to=Group,
on_delete=models.CASCADE,
Expand All @@ -123,17 +124,16 @@ def __str__(self):
return self.path

class Meta:
unique_together = [['harvester', 'path']]
unique_together = [['harvester', 'path', 'regex']]


class ObservedFile(models.Model):
monitored_path = models.ForeignKey(
to=MonitoredPath,
related_name='files',
path = models.TextField(help_text="Absolute file path")
harvester = models.ForeignKey(
to=Harvester,
on_delete=models.CASCADE,
help_text="Harvester directory Path where file is located"
help_text="Harvester that harvested the File"
)
relative_path = models.TextField(help_text="File path from Harvester directory Path")
last_observed_size = models.PositiveBigIntegerField(
null=False,
default=0,
Expand All @@ -151,10 +151,10 @@ class ObservedFile(models.Model):
)

def __str__(self):
return self.relative_path
return self.path

class Meta:
unique_together = [['monitored_path', 'relative_path']]
unique_together = [['path', 'harvester']]


class HarvestError(models.Model):
Expand All @@ -164,11 +164,6 @@ class HarvestError(models.Model):
on_delete=models.CASCADE,
help_text="Harvester which reported the error"
)
path = models.ForeignKey(
to=MonitoredPath,
on_delete=models.DO_NOTHING,
help_text="Path to data directory where error originated"
)
file = models.ForeignKey(
to=ObservedFile,
related_name='errors',
Expand Down
71 changes: 31 additions & 40 deletions backend/backend_django/galv/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
DataColumn, \
TimeseriesRangeLabel, \
KnoxAuthToken
from django.db import connection
from .utils import get_monitored_paths
from django.utils import timezone
from django.contrib.auth.models import User, Group
from django.conf.global_settings import DATA_UPLOAD_MAX_MEMORY_SIZE
Expand Down Expand Up @@ -224,18 +224,11 @@ class MonitoredPathSerializer(serializers.HyperlinkedModelSerializer):

@extend_schema_field(UserSetSerializer(many=True))
def get_user_sets(self, instance):
group_ids = [instance.harvester.admin_group.id, instance.admin_group.id, instance.user_group.id]
group_ids = [instance.admin_group.id, instance.user_group.id]
return UserSetSerializer(
Group.objects.filter(id__in=group_ids).order_by('id'),
context={
'request': self.context.get('request'),
instance.harvester.admin_group.id: {
'name': 'Harvester admins',
'description': (
'Harvester administrators can alter any of the harvester\'s paths or datasets.'
),
'is_admin': True
},
instance.admin_group.id: {
'name': 'Admins',
'description': (
Expand All @@ -246,7 +239,7 @@ def get_user_sets(self, instance):
instance.user_group.id: {
'name': 'Users',
'description': (
'Users can monitored paths and their datasets.'
'Users can view monitored paths and edit their datasets.'
)
}
},
Expand All @@ -259,28 +252,13 @@ def validate_path(self, value):
except BaseException as e:
raise ValidationError(f"Invalid path: {e.__context__}")
abs_path = os.path.abspath(value)
if self.instance is not None:
harvester = self.instance.harvester
else:
if self.instance is None:
try:
pk = resolve(urlparse(self.initial_data['harvester']).path).kwargs['pk']
harvester = Harvester.objects.get(id=pk)
Harvester.objects.get(id=pk)
except BaseException:
raise ValidationError("Harvester not found")
all_paths = MonitoredPath.objects.filter(harvester=harvester)
for p in all_paths:
if self.instance is None or p.id != self.instance.id:
other_abs_path = os.path.abspath(p.path)
if abs_path.startswith(other_abs_path):
if abs_path == os.path.abspath(p.path):
raise ValidationError(f"Path already exists on harvester")
raise ValidationError(f"Path is a subpath of existing path {p.path}")
if other_abs_path.startswith(abs_path):
raise ValidationError((
f"Path is a parent of existing path {p.path}. "
f"Delete that path before creating this one."
))
return value
return abs_path

def validate_stable_time(self, value):
try:
Expand Down Expand Up @@ -315,7 +293,7 @@ def validate(self, attrs):

class Meta:
model = MonitoredPath
fields = ['url', 'id', 'path', 'regex', 'stable_time', 'harvester', 'user_sets']
fields = ['url', 'id', 'path', 'regex', 'stable_time', 'active', 'harvester', 'user_sets']
read_only_fields = ['url', 'id', 'harvester', 'user_sets']
extra_kwargs = augment_extra_kwargs()

Expand All @@ -324,17 +302,20 @@ class MonitoredPathCreateSerializer(MonitoredPathSerializer):

def create(self, validated_data):
stable_time = validated_data.get('stable_time', 60)
regex = validated_data.get('regex', '.*')
# Create Path
try:
monitored_path = MonitoredPath.objects.create(
path=validated_data['path'],
harvester=validated_data['harvester'],
stable_time=stable_time
stable_time=stable_time,
regex=regex
)
except (TypeError, ValueError):
monitored_path = MonitoredPath.objects.create(
path=validated_data['path'],
harvester=validated_data['harvester']
harvester=validated_data['harvester'],
regex=regex
)
# Create user/admin groups
monitored_path.admin_group = Group.objects.create(
Expand All @@ -353,8 +334,11 @@ def to_representation(self, instance):

class Meta:
model = MonitoredPath
fields = ['path', 'stable_time', 'harvester']
extra_metadata = {'stable_time': {'required': False}}
fields = ['path', 'regex', 'stable_time', 'harvester']
extra_metadata = {
'regex': {'required': False},
'stable_time': {'required': False},
}


class ObservedFileSerializer(serializers.HyperlinkedModelSerializer):
Expand Down Expand Up @@ -383,13 +367,12 @@ def get_upload_info(self, instance) -> dict | None:
class Meta:
model = ObservedFile
fields = [
'url', 'id',
'monitored_path', 'relative_path',
'url', 'id', 'harvester', 'path',
'state', 'last_observed_time', 'last_observed_size', 'errors',
'datasets', 'upload_info'
]
read_only_fields = [
'url', 'id', 'monitored_path', 'relative_path',
'url', 'id', 'harvester', 'path',
'last_observed_time', 'last_observed_size', 'datasets',
'errors'
]
Expand All @@ -402,7 +385,7 @@ class Meta:
class HarvestErrorSerializer(serializers.HyperlinkedModelSerializer):
class Meta:
model = HarvestError
fields = ['url', 'id', 'harvester', 'path', 'file', 'error', 'timestamp']
fields = ['url', 'id', 'harvester', 'file', 'error', 'timestamp']
extra_kwargs = augment_extra_kwargs()


Expand Down Expand Up @@ -466,9 +449,16 @@ class DatasetSerializer(serializers.HyperlinkedModelSerializer):

@extend_schema_field(UserSetSerializer(many=True))
def get_user_sets(self, instance):
return MonitoredPathSerializer(
instance.file.monitored_path, context={'request': self.context.get('request')}
).data.get('user_sets')
monitored_paths = get_monitored_paths(instance.file.path, instance.file.harvester)
user_sets = []
ids = []
for mp in monitored_paths:
sets = MonitoredPathSerializer(mp, context=self.context).data.get('user_sets')
sets = [{**s, 'name': f"MonitoredPath_{mp.id}-{s['name']}"} for s in sets]
sets = [s for s in sets if s['id'] not in ids]
ids += [s['id'] for s in sets]
user_sets = [user_sets, *sets]
return user_sets

class Meta:
model = Dataset
Expand Down Expand Up @@ -599,6 +589,7 @@ class HarvesterConfigSerializer(HarvesterSerializer):
standard_columns = serializers.SerializerMethodField(help_text="Column Types recognised by the initial database")
max_upload_bytes = serializers.SerializerMethodField(help_text="Maximum upload size (bytes)")
deleted_environment_variables = serializers.SerializerMethodField(help_text="Envvars to unset")
monitored_paths = MonitoredPathSerializer(many=True, read_only=True, help_text="Directories to harvest")

@extend_schema_field(DataUnitSerializer(many=True))
def get_standard_units(self, instance):
Expand Down
6 changes: 3 additions & 3 deletions backend/backend_django/galv/tests/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ def groups(obj, *args, **kwargs):
class ObservedFileFactory(factory.django.DjangoModelFactory):
class Meta:
model = ObservedFile
django_get_or_create = ('monitored_path', 'relative_path',)
django_get_or_create = ('harvester', 'path')

relative_path = factory.Faker('file_name')
monitored_path = factory.SubFactory(MonitoredPathFactory)
path = factory.Faker('file_path')
harvester = factory.SubFactory(HarvesterFactory)


class DatasetFactory(factory.django.DjangoModelFactory):
Expand Down
7 changes: 4 additions & 3 deletions backend/backend_django/galv/tests/test_view_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from .factories import UserFactory, \
HarvesterFactory, \
DatasetFactory
DatasetFactory, MonitoredPathFactory

logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)
Expand All @@ -19,11 +19,12 @@
class DatasetTests(APITestCase):
def setUp(self):
self.harvester = HarvesterFactory.create(name='Test Dataset')
self.dataset = DatasetFactory.create(file__monitored_path__harvester=self.harvester)
self.dataset = DatasetFactory.create(file__harvester=self.harvester)
self.monitored_path = MonitoredPathFactory.create(harvester=self.harvester, path="/")
self.user = UserFactory.create(username='test_user')
self.admin_user = UserFactory.create(username='test_user_admin')
self.user.groups.add(self.harvester.user_group)
self.admin_user.groups.add(self.dataset.file.monitored_path.admin_group)
self.admin_user.groups.add(self.monitored_path.admin_group)
self.url = reverse('dataset-detail', args=(self.dataset.id,))

def test_view(self):
Expand Down
6 changes: 3 additions & 3 deletions backend/backend_django/galv/tests/test_view_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
class ObservedFileTests(APITestCase):
def setUp(self):
self.harvester = HarvesterFactory.create(name='Test Files')
self.path = MonitoredPathFactory.create(harvester=self.harvester)
self.files = ObservedFileFactory.create_batch(size=5, monitored_path=self.path)
self.path = MonitoredPathFactory.create(harvester=self.harvester, path="/")
self.files = ObservedFileFactory.create_batch(size=5, harvester=self.harvester)
self.user = UserFactory.create(username='test_user')
self.admin_user = UserFactory.create(username='test_user_admin')
self.user.groups.add(self.harvester.user_group)
Expand All @@ -42,7 +42,7 @@ def test_view(self):

def test_reimport(self):
self.client.force_login(self.admin_user)
print("Test view path")
print("Test reimport")
url = reverse('observedfile-reimport', args=(self.files[0].id,))
self.assertEqual(self.client.get(url).status_code, status.HTTP_200_OK)
self.assertEqual(ObservedFile.objects.get(id=self.files[0].id).state, FileState.RETRY_IMPORT)
Expand Down
Loading

0 comments on commit cb927e7

Please sign in to comment.