Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix migration 055 #1428

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#

from collections import defaultdict
from django.db import migrations
from django.db import models
from django.core.paginator import Paginator


class Migration(migrations.Migration):
Expand All @@ -19,29 +20,36 @@ def remove_duped_changelogs(apps, schema_editor):

models_list = [PackageChangeLog, VulnerabilityChangeLog]

common_fields = ('actor_name', 'action_type', 'source_url')
for model in models_list:
# Identify duplicate records based on actor_name, action_type, and source_url
duplicate_records = model.objects.values('actor_name', 'action_type', 'source_url').annotate(count=models.Count('id')).filter(count__gt=1)
record_groups = defaultdict(list)
fields = set()
key = tuple()
if model == PackageChangeLog:
fields = common_fields + ('package', 'related_vulnerability')
all_records = model.objects.select_related("package").all()
elif model == VulnerabilityChangeLog:
fields = common_fields + ('vulnerability',)
all_records = model.objects.select_related("vulnerability").all()

to_be_deleted = list()
print("Total number of records", all_records.count())
for record in paginated(all_records):
print(",", end="")
key = tuple(getattr(record, attr) for attr in fields)
record_groups[key].append(record.id)

for duplicate_set in duplicate_records:
# Get the records for the current duplicate set
records_to_delete = model.objects.filter(
actor_name=duplicate_set['actor_name'],
action_type=duplicate_set['action_type'],
source_url=duplicate_set['source_url']
).order_by('-software_version')
to_be_deleted = []
for record_ids in record_groups.values():
if len(record_ids) == 1:
continue
print(".", end="")
# We exclude the oldest ID which is the last one based on the standard
# ordering by decreasing the action time
to_be_deleted.extend(record_ids[:-1])

# Keep the record with the older software version
record_to_keep = records_to_delete.last()

# Delete the records with the newer software version
to_be_deleted.extend(records_to_delete.exclude(id=record_to_keep.id))

to_be_deleted = list(set(to_be_deleted))
to_be_deleted = [rec.id for rec in to_be_deleted]
model.objects.filter(id__in = to_be_deleted).delete()
chunks = [to_be_deleted[x:x+10000] for x in range(0, len(to_be_deleted), 10000)]
for chunk in chunks:
model.objects.filter(id__in=chunk).delete()

dependencies = [
("vulnerabilities", "0054_alter_packagechangelog_software_version_and_more"),
Expand All @@ -50,3 +58,18 @@ def remove_duped_changelogs(apps, schema_editor):
operations = [
migrations.RunPython(remove_duped_changelogs, reverse_code=migrations.RunPython.noop),
]


def paginated(qs, per_page=5000):
    """
    Yield the objects of the QuerySet ``qs``, fetching them from the database
    in pages of ``per_page`` items.

    Iterating a large QuerySet directly caches every row in memory at once;
    paging through it keeps memory usage bounded, which is essential for data
    migrations over big tables.
    See these links for inspiration:
    https://nextlinklabs.com/resources/insights/django-big-data-iteration
    https://stackoverflow.com/questions/4222176/why-is-iterating-through-a-large-django-queryset-consuming-massive-amounts-of-me/
    """
    paginator = Paginator(qs, per_page=per_page)
    for page_number in paginator.page_range:
        # ``yield from`` replaces the original inner loop, which also
        # shadowed the ``object`` builtin with its loop variable.
        yield from paginator.page(page_number).object_list
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Generated by Django 4.1.13 on 2024-02-28 15:58

from django.db import migrations, models


class Migration(migrations.Migration):
    # Follows the data migration that removed duplicated changelog rows;
    # this migration then tightens the schema so such duplicates cannot
    # be inserted again.

    dependencies = [
        ("vulnerabilities", "0055_remove_changelogs_with_same_data_different_software_version"),
    ]

    operations = [
        # Reset the model-level Meta options of both changelog models to
        # empty (drops whatever options were previously declared).
        migrations.AlterModelOptions(
            name="packagechangelog",
            options={},
        ),
        migrations.AlterModelOptions(
            name="vulnerabilitychangelog",
            options={},
        ),
        # Update the default recorded software version on both models.
        # NOTE(review): the default is pinned to the release current at
        # generation time ("34.0.0rc3") — it will need bumping in a later
        # migration for newer releases; confirm this is intentional.
        migrations.AlterField(
            model_name="packagechangelog",
            name="software_version",
            field=models.CharField(
                default="34.0.0rc3",
                help_text="Version of the software at the time of change",
                max_length=100,
            ),
        ),
        migrations.AlterField(
            model_name="vulnerabilitychangelog",
            name="software_version",
            field=models.CharField(
                default="34.0.0rc3",
                help_text="Version of the software at the time of change",
                max_length=100,
            ),
        ),
        # Enforce uniqueness of changelog entries at the database level.
        # software_version is deliberately absent from both constraints, so
        # re-running the same action under a newer software version cannot
        # create a near-duplicate row.
        migrations.AlterUniqueTogether(
            name="packagechangelog",
            unique_together={
                (
                    "action_time",
                    "actor_name",
                    "action_type",
                    "source_url",
                    "related_vulnerability",
                    "package",
                )
            },
        ),
        migrations.AlterUniqueTogether(
            name="vulnerabilitychangelog",
            unique_together={
                ("action_time", "actor_name", "action_type", "source_url", "vulnerability")
            },
        ),
    ]

This file was deleted.

This file was deleted.

8 changes: 8 additions & 0 deletions vulnerabilities/tests/test_data_migrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,14 @@ def setUpBeforeMigration(self, apps):
package=pkg1,
related_vulnerability=vuln,
)
PackageChangeLog.objects.create(
actor_name="Nginx",
action_type=1,
source_url="test",
software_version="2",
package=pkg1,
related_vulnerability=vuln,
)
VulnerabilityChangeLog.objects.create(
actor_name="Nginx",
action_type=1,
Expand Down