git.parisson.com Git - telemeta-data.git/commitdiff
migration: add collection code converter
author: olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Fri, 24 Apr 2009 16:58:58 +0000 (16:58 +0000)
committer: olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Fri, 24 Apr 2009 16:58:58 +0000 (16:58 +0000)
git-svn-id: http://svn.parisson.org/svn/crem@87 3bf09e05-f825-4182-b9bc-eedd7160adf0

trunk/import/migration/tasks/collections.py

index eff2b3ba349b44a4856750cd42bc609b62048dfa..6dd97c5e5e74be551efeda3684076560cc79bc6e 100644 (file)
@@ -36,6 +36,7 @@ from api import IDataMigrator
 from core import DataMigrator
 from _mysql_exceptions import IntegrityError
 from MySQLdb.constants.ER import DUP_ENTRY
+import re
 
 class CollectionsCopyMigrator(DataMigrator):
     """Perform a preliminary raw copy of the collection table"""
@@ -84,11 +85,16 @@ class CollectionsCopyMigrator(DataMigrator):
         target_fields   = [str(a[0]) for a in assign]
         src_fields      = [str(a[1]) for a in assign]
 
+        self.target_cursor.execute("DELETE FROM media_collections")
+
         self.src_cursor.execute("SELECT COUNT(*) FROM %s.Support" %  self.src_db_name)
         count = self.src_cursor.fetchone()[0]
         self.stats = { 'total': count, 'imported': 0, 'ignored': 0}
 
         for offset in range(0, count):
+            if offset % 400 == 0:
+                self.step()
+
             query = "INSERT INTO %s.media_collections (\n  %s\n)\n" \
                     "SELECT \n  %s\n FROM %s.Support AS s LIMIT %d, 1" % (
                         self.target_db_name, 
@@ -174,7 +180,127 @@ class CollectionsEnumMigrator(DataMigrator):
             offset += self.src_cursor.rowcount                
             self.step()                            
 
-                            
+class CollectionsCodeConverter(DataMigrator):
+    """Convert old-style collection codes to new-style CNRSMH_* codes.
+
+    Each entry of `patterns` describes one known old code layout. Regex
+    groups, when present, are in order: prefix, year, serial, physical."""
+
+    implements(IDataMigrator)
+
+    patterns = [
+        { 'format': 'BM.aaa.nnn.mmm',           'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})$'}, 
+        { 'format': 'BM.aaaa.nnn.mmm/pp',       'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'}, 
+        { 'format': 'BM.aaaa.nnn.mmm',          'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})$'}, 
+        { 'format': 'BM.aaaa.nnn.mmm/',         'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/$'}, 
+        { 'format': 'BM.aaaa.nnn.mmm/ppp',      'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{3}$'}, 
+        { 'format': 'BM.aaaa.nnn.mm/pp',        'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{2})/[0-9]{2}$'}, 
+        { 'format': 'BM.aaaa.nnn',              'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})$'}, 
+        { 'format': 'BM.aaa.nnn.mmm/pp',        'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'}, 
+        { 'format': 'BM.aaa.nnn FANTOME',       'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3}) FANTOME$'}, 
+        { 'format': 'BM.aaa.nnn',               'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})$'}, 
+        { 'format': 'BM.aaa.nnnBISoo/pp',       'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})BIS([0-9]{2})/[0-9]{2}$'}, 
+        { 'format': 'BM.aaa.nnn.mmm.ppp',       'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})\.[0-9]{3}$'}, 
+        { 'format': 'BM.aaa.nnn.mmm/ppp',       'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{3}$'}, 
+        { 'format': 'BM.aaa.nnn/pp',            'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'}, 
+        { 'format': 'BM.aaa.nnn-BIS.ooo/pp',    'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})-BIS\.([0-9]{3})/[0-9]{2}$'}, 
+        { 'format': 'BM.aaaa.nnn.mmm/NN',       'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/NN$'}, 
+        { 'format': 'BM.aaa.nnn.mmm/pp-DEPOT',  'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}-DEPOT$'}, 
+        { 'format': 'BM.aaa.nnn.mmm-o>p',       'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})-[0-9]>[0-9]$'}, 
+        { 'format': 'CY.aaaa.nnn',              'regex': r'^(CY)\.([0-9]{4})\.([0-9]{3})$'}, 
+        { 'format': 'DI.aaaa.nnn.mmm',          'regex': r'^(DI)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})$'}, 
+        { 'format': 'DI.aaaa.nnn.mmm/pp',       'regex': r'^(DI)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'}, 
+        { 'format': 'DI.aaa.nnn.mmm',           'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})$'}, 
+        { 'format': 'DI.aaa.nnn.mmm/pp',        'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'}, 
+        { 'format': 'DI.aaa.nnn.mmm-o/p',       'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})-[0-9]/[0-9]$'}, 
+        { 'format': 'FANTOME 2*',               'regex': r'FANTOME 2\*$'}, 
+    ]
+
+    def get_name(self):
+        return "collections:code"
+
+    def process(self):
+        """Fill media_collections.code from old_code, pattern by pattern.
+
+        Records whose old code has no year or serial, or whose new code
+        collides with an existing one, are deleted. Fills self.stats."""
+        self.target_cursor.execute("SELECT COUNT(*) FROM media_collections")
+
+        self.stats = {
+            'total':      self.target_cursor.fetchone()[0],
+            'matched':    0,
+            'unmatched':  0,
+            'converted':  0,
+            'removed':    0,
+            'remaining':  0
+        }
 
+        read_cursor = self.target_db.cursor()
 
+        for pattern in self.patterns:
+            # MySQL REGEXP is unanchored and case insensitive on non-binary
+            # strings; search with IGNORECASE so that rows selected by MySQL
+            # also match here (re.match would return None and crash below).
+            regex = re.compile(pattern['regex'], re.IGNORECASE)
+            read_cursor.execute("SELECT old_code FROM media_collections WHERE old_code REGEXP %s", 
+                                       (pattern['regex'],))
+            while True:
+                row = read_cursor.fetchone()
+                if not row:
+                    break
+                old_code = row[0]
+                match = regex.search(old_code)
+                if match is None:
+                    # Defensive: leave the record untouched, count it unmatched
+                    print 'Skipped record, old code not matched locally: %s' % old_code
+                    continue
+
+                self.stats['matched'] += 1
+                if self.stats['matched'] % 500 == 0:
+                    self.step()
+
+                # Patterns may define fewer than four groups: pad with None
+                groups = (match.groups() + (None,) * 4)[:4]
+                prefix, year, serial, physical = groups
+
+                if year is None or serial is None:
+                    self.target_cursor.execute("DELETE FROM media_collections WHERE old_code = %s", (old_code,))
+                    print 'Removed record, old code is missing year or serial: %s' % old_code
+                    self.stats['removed'] += 1
+                    continue
         
+                published = (prefix.upper() == 'DI')
+                if published:
+                    tokens = ['CNRSMH_E']
+                else:
+                    tokens = ['CNRSMH_I']
+                # Years below 100 get 2000 added, other years below 1000 get 1000
+                year = int(year)
+                if year < 100:
+                    year += 2000
+                elif year < 1000:
+                    year += 1000
+
+                tokens.append(str(year))
+                tokens.append('%03d' % int(serial))
+                if published:
+                    # Published codes default to physical number 001
+                    if physical is not None:
+                        tokens.append('%03d' % int(physical))
+                    else:
+                        tokens.append('001')
+                new_code = '_'.join(tokens)
+
+                try:
+                    self.target_cursor.execute("UPDATE media_collections SET code = %s WHERE old_code = %s", (new_code, old_code))
+                    self.stats['converted'] += 1
+                except IntegrityError, e:
+                    if e.args[0] != DUP_ENTRY:
+                        raise
+                    # The new code already exists: drop the colliding record
+                    self.target_cursor.execute("DELETE FROM media_collections WHERE old_code = %s", (old_code,))
+                    print 'Removed record, code conversion caused a duplicate entry: %s -> %s' % (old_code, new_code)
+                    self.stats['removed'] += 1
+
+        self.stats['remaining'] = self.stats['total'] - self.stats['removed']
+        self.stats['unmatched'] = self.stats['total'] - self.stats['matched']