# Reconstructed from a gitweb "commitdiff_plain" page whose line breaks were
# lost.  Provenance, as stated in the patch header:
#   author:  olivier, Fri, 24 Apr 2009 16:58:58 +0000
#   subject: migration: add collection code converter
#   commit:  b0a22a9f257f2f449c27d52f4c25b756e1490503 (telemeta-data.git)
#   svn:     http://svn.parisson.org/svn/crem@87
#   file:    trunk/import/migration/tasks/collections.py
#
# Besides adding the class below and ``import re``, the same patch made two
# additions inside CollectionsCopyMigrator (only fragments of that class are
# visible in the patch, so it is not reproduced here):
#   * self.target_cursor.execute("DELETE FROM media_collections") before the
#     source rows are counted;
#   * ``if offset % 400 == 0: self.step()`` at the top of the per-offset loop.

import re
import sys

from _mysql_exceptions import IntegrityError
from MySQLdb.constants.ER import DUP_ENTRY

from api import IDataMigrator
from core import DataMigrator

# NOTE(review): ``implements`` is used below but its import is not part of the
# visible patch context; it has to come from the unseen head of the file.


class CollectionsCodeConverter(DataMigrator):
    """Convert old to new-style collection codes.

    Every row of ``media_collections`` whose ``old_code`` matches one of
    ``patterns`` gets a new ``code`` of the form::

        CNRSMH_I_<year>_<serial>              (old prefix other than DI)
        CNRSMH_E_<year>_<serial>_<physical>   (old prefix DI)

    Rows whose old code carries no year/serial, or whose new code collides
    with an existing one, are deleted from ``media_collections``.
    """

    implements(IDataMigrator)

    # Known old-code layouts.  'format' is a human-readable description only;
    # 'regex' is handed both to MySQL (REGEXP) and to Python's ``re``.  The
    # capture groups are, in order: prefix, year, serial and, optionally, the
    # physical item number.
    patterns = [
        {'format': 'BM.aaa.nnn.mmm',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})$'},
        {'format': 'BM.aaaa.nnn.mmm/pp',
         'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
        {'format': 'BM.aaaa.nnn.mmm',
         'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})$'},
        {'format': 'BM.aaaa.nnn.mmm/',
         'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/$'},
        {'format': 'BM.aaaa.nnn.mmm/ppp',
         'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{3}$'},
        {'format': 'BM.aaaa.nnn.mm/pp',
         'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{2})/[0-9]{2}$'},
        {'format': 'BM.aaaa.nnn',
         'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})$'},
        {'format': 'BM.aaa.nnn.mmm/pp',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
        {'format': 'BM.aaa.nnn FANTOME',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3}) FANTOME$'},
        {'format': 'BM.aaa.nnn',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})$'},
        {'format': 'BM.aaa.nnnBISoo/pp',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})BIS([0-9]{2})/[0-9]{2}$'},
        {'format': 'BM.aaa.nnn.mmm.ppp',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})\.[0-9]{3}$'},
        {'format': 'BM.aaa.nnn.mmm/ppp',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{3}$'},
        {'format': 'BM.aaa.nnn/pp',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
        {'format': 'BM.aaa.nnn-BIS.ooo/pp',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})-BIS\.([0-9]{3})/[0-9]{2}$'},
        {'format': 'BM.aaaa.nnn.mmm/NN',
         'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/NN$'},
        {'format': 'BM.aaa.nnn.mmm/pp-DEPOT',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}-DEPOT$'},
        {'format': 'BM.aaa.nnn.mmm-o>p',
         'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})-[0-9]>[0-9]$'},
        {'format': 'CY.aaaa.nnn',
         'regex': r'^(CY)\.([0-9]{4})\.([0-9]{3})$'},
        {'format': 'DI.aaaa.nnn.mmm',
         'regex': r'^(DI)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})$'},
        {'format': 'DI.aaaa.nnn.mmm/pp',
         'regex': r'^(DI)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
        {'format': 'DI.aaa.nnn.mmm',
         'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})$'},
        {'format': 'DI.aaa.nnn.mmm/pp',
         'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
        {'format': 'DI.aaa.nnn.mmm-o/p',
         'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})-[0-9]/[0-9]$'},
        {'format': 'FANTOME 2*',
         'regex': r'FANTOME 2\*$'},
    ]

    def get_name(self):
        """Return the task identifier, "collections:code"."""
        return "collections:code"

    def process(self):
        """Rewrite ``code`` for every collection whose ``old_code`` is known.

        Fills ``self.stats`` with:
          total     -- rows in media_collections before conversion
          matched   -- rows whose old code matched one of ``patterns``
          unmatched -- total - matched
          converted -- rows whose code was updated
          removed   -- rows deleted (no year/serial, or duplicate new code)
          remaining -- total - removed

        Raises IntegrityError if an UPDATE fails for any reason other than a
        duplicate entry.
        """
        self.target_cursor.execute("SELECT COUNT(*) FROM media_collections")

        self.stats = {
            'total': self.target_cursor.fetchone()[0],
            'matched': 0,
            'unmatched': 0,
            'converted': 0,
            'removed': 0,
            'remaining': 0,
        }

        # A dedicated cursor is used for reading so that the UPDATE/DELETE
        # statements issued on self.target_cursor do not reset the result set
        # being iterated.
        read_cursor = self.target_db.cursor()
        try:
            for pattern in self.patterns:
                self._convert_matching(read_cursor, pattern['regex'])
        finally:
            read_cursor.close()

        self.stats['remaining'] = self.stats['total'] - self.stats['removed']
        self.stats['unmatched'] = self.stats['total'] - self.stats['matched']

    def _convert_matching(self, read_cursor, regex):
        """Convert (or remove) every row whose old_code matches ``regex``."""
        compiled = re.compile(regex)
        read_cursor.execute(
            "SELECT old_code FROM media_collections WHERE old_code REGEXP %s",
            (regex,))

        while True:
            row = read_cursor.fetchone()
            if not row:
                break

            old_code = row[0]

            # search(), not match(): SQL REGEXP finds the pattern anywhere in
            # the value, and the 'FANTOME 2*' pattern has no leading '^'.
            match = compiled.search(old_code)
            if match is None:
                # The database may match more loosely than Python does (for
                # example REGEXP ignores case on non-binary strings).  Such a
                # row cannot be parsed, so leave it untouched and let it be
                # reported as unmatched.
                print('Skipped record, old code selected by the database '
                      'but not parseable: %s' % old_code)
                continue

            self.stats['matched'] += 1
            if self.stats['matched'] % 500 == 0:
                self.step()

            new_code = self._build_code(match.groups())
            if new_code is None:
                self._remove(
                    old_code,
                    'Removed record, old code is missing year or serial: %s'
                    % old_code)
                continue

            try:
                self.target_cursor.execute(
                    "UPDATE media_collections SET code = %s WHERE old_code = %s",
                    (new_code, old_code))
            except IntegrityError:
                # sys.exc_info() instead of "except ..., e" / "except ... as e"
                # keeps this statement valid on every Python version.
                errno = sys.exc_info()[1].args[0]
                if errno != DUP_ENTRY:
                    # Bare raise keeps the original traceback.
                    raise
                self._remove(
                    old_code,
                    'Removed record, code conversion caused a duplicate '
                    'entry: %s -> %s' % (old_code, new_code))
            else:
                self.stats['converted'] += 1

    def _remove(self, old_code, message):
        """Delete the rows carrying ``old_code``, log why, count the removal."""
        self.target_cursor.execute(
            "DELETE FROM media_collections WHERE old_code = %s", (old_code,))
        print(message)
        self.stats['removed'] += 1

    @staticmethod
    def _build_code(groups):
        """Build the new-style code from the captured regex groups.

        ``groups`` is (prefix, year, serial[, physical]).  Returns None when
        year or serial is missing (fewer than three groups).
        """
        if len(groups) < 3:
            return None

        published = (groups[0] == 'DI')
        year = int(groups[1])
        serial = int(groups[2])

        # Expand abbreviated years: values below 100 become 20xx, other
        # three-digit values become 1xxx.
        if year < 100:
            year += 2000
        elif year < 1000:
            year += 1000

        if published:
            tokens = ['CNRSMH_E']
        else:
            tokens = ['CNRSMH_I']
        tokens.append(str(year))
        tokens.append(str(serial).rjust(3, '0'))

        # Only published (DI) codes carry a physical item number; it defaults
        # to 001 when the old code has none.
        if published:
            if len(groups) >= 4:
                tokens.append(str(int(groups[3])).rjust(3, '0'))
            else:
                tokens.append('001')

        return '_'.join(tokens)