Friday, January 21, 2022

Re: Search & replace an object in the database

from django.contrib.contenttypes.fields import GenericRelation, GenericForeignKey
from django.contrib.contenttypes.models import ContentType
from django.db.models import sql
from django.db.models.deletion import Collector
from django.db.utils import IntegrityError


class LogMessage:
def __init__(self, instance, message=""):
self._instance = instance
self._message = message

def __str__(self):
try:
model_name = self._instance._meta.verbose_name
except AttributeError:
model_name = self._instance.__class__.__name__

return f'{model_name}, {str(self._instance)}, {self._message}'


class ObjectReplacer:
def __init__(self):
self._errors = []
self._updates = []

def replace_object(self, old_obj, new_obj, using='default'):
collector = Collector(using)
collector.collect([old_obj], keep_parents=True)
self.__update_fields(collector, new_obj)
self.__update_fast_deletes(collector, new_obj)
self.__update_the_rest(collector, old_obj, new_obj)

self.show_updates()
self.show_errors()

def __update_fields(self, collector, new_obj):
for model, instances_for_fieldvalues in collector.field_updates.items():
for (field, value), instances in instances_for_fieldvalues.items():
if self.__update_batch(model, [obj.pk for obj in instances], field.name, new_obj.pk, collector.using):
self.add_updates([LogMessage(obj) for obj in instances])

def __update_fast_deletes(self, collector, new_obj):
for qs in collector.fast_deletes:
print(qs)
return []

def __update_the_rest(self, collector, old_obj, new_obj):
# delete instances
for model, instances in collector.data.items():
# Ok, django provides us with model and instances.
# We'll need to find the referencing field and update with the new PK.
# print(model, instances)

for instance in instances:
for field_opts in instance._meta.get_fields(include_hidden=True):
if hasattr(field_opts, 'related_model') and field_opts.related_model == old_obj.__class__:
if instance == old_obj:
# Don't attempt to change the old object itself.
continue

if hasattr(field_opts, 'field') and isinstance(field_opts.field, GenericRelation):
self.__update_generic_relation(instance, collector, old_obj, new_obj)
else:
self.__simple_update(model, instance, collector, new_obj, field_opts)

def __simple_update(self, model, instance, collector, new_obj, field_opts):
if self.__update_batch(model, [instance.pk], field_opts.name, new_obj.pk, collector.using):
self.add_update(LogMessage(instance))

def __update_generic_relation(self, instance, collector, old_obj, new_obj):
# If it's a generic relation, we search the fields for a GenericForeignKey field.
# This will give us information about the desired content type and object id.
for gen_field_opts in instance._meta.get_fields():
if isinstance(gen_field_opts, GenericForeignKey):
ct_for_this_field = getattr(instance, gen_field_opts.ct_field)
desired_ct = ContentType.objects.get_for_model(old_obj)

if ct_for_this_field == desired_ct:
if self.__update_batch(instance.__class__, [instance.pk], getattr(gen_field_opts, 'fk_field'),
new_obj.pk, collector.using):
self.add_update(LogMessage(instance))
return

def __update_batch(self, model, pks, field_name, new_pk, using):
query = sql.UpdateQuery(model)
try:
query.update_batch(pks, {field_name: new_pk}, using)
except IntegrityError as error:
self.add_errors([LogMessage(None, str(error))])
return False

return True

def add_error(self, log_message):
self._errors.append(log_message)

def add_errors(self, log_messages):
self._errors += log_messages

def add_update(self, log_message):
self._updates.append(log_message)

def add_updates(self, log_messages):
self._updates += log_messages

def show_updates(self):
print("Updated:")
for log_msg in self._updates:
print("\t", str(log_msg))

def show_errors(self):
print("Errors:")
for log_msg in self._errors:
print("\t", str(log_msg))
Here's an update:

I sticked to the idea of using collector; it categorizes models from a deletion perspective into:
  • fast_deletes - models that can be easily deleted, without checking for other relations
  • update_fields - field that have SET_NULL, i.e. relations to the customer objects that are only updated
  • … all the rest
So I kept the structure from Collector, looping the three categories mentioned above. It works ok; for each instance, you'll have the check instance._meta.get_fields() to find the field related to the old customer record.
Fixing GenericRelations require the most code, because you'll have to find the content type, object id (and maybe content object) fields that are related to the old customer record.

Moreover, you'll have to be aware that you may violate foreign key constraints when replacing the old id.

My implementation is attached, if it's interesting for anybody.

Have a nice weekend y'all
jools
jools schrieb am Freitag, 21. Januar 2022 um 16:05:58 UTC+1:
Dear Django users,

here's an interesting case and I'm curious whether somebody can point me in the right direction.

Here's the problem:
My company has a database of customers. Over the years, many duplicates have been created. For cleaning up duplicates, I'd like to have a search-and-replace functionality.

I.e.: Replace all references in the database to customer <old> with customer <new>.
The database schema has quite a bit of complexity, so I'm aiming to find a very generic solution.

Approach:
Django has a bit of functionality to find all references to an existing object, namely django.db.models.deletion.Collector and I'm using it to find all references.

Though, the "replace" logic seems quite hard to get right:
  • It has to keep parent links intact
  • It has to recognize references in parent models (Customer model is derived from the concrete model Actor)
  • It has to recognize generic relations, built with Django's content types

My stub implementation comes below.

  • Has anybody else implemented sth like this, e.g. in helper library?
  • Do you think the approach is right?
  • What would you differently?
  • Any caveats that you know of?
Best regards
Jools

Stub implementation:

from django.db.models.deletion import Collector, ProtectedError, RestrictedError


def replace_object(old_obj, new_obj, using='default'):
    """
    Replace an old object with a new object throughout the database.
    """
    collector = Collector(using)

    try:
        collector.collect([old_obj])
    except ProtectedError:
                pass
    except RestrictedError:
                pass

    for model, obj in collector.instances_with_model():
        for field_opts in obj._meta.get_fields():
            if __is_qualified(field_opts, new_obj):
                setattr(obj, field_opts.name, new_obj)
                obj.save()

def __is_qualified(field_opts, new_obj):
    if not hasattr(field_opts, 'related_model'):
        return False
   
    # This check is wrong for multi-table inheritance, I suppose?
    if field_opts.related_model != new_obj.__class__:
        return False
    if hasattr(field_opts, 'parent_link') and field_opts.parent_link is True:
        return False

    return True

--
You received this message because you are subscribed to the Google Groups "Django users" group.
To unsubscribe from this group and stop receiving emails from it, send an email to django-users+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/django-users/1a094625-ab1c-4763-81f2-d8c77ffab870n%40googlegroups.com.

No comments:

Post a Comment