]> git.parisson.com Git - telemeta-data.git/commitdiff
migration: moved dates-related tasks to a specific module, improved items date conver...
authorolivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Thu, 11 Jun 2009 16:02:59 +0000 (16:02 +0000)
committerolivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Thu, 11 Jun 2009 16:02:59 +0000 (16:02 +0000)
git-svn-id: http://svn.parisson.org/svn/crem@98 3bf09e05-f825-4182-b9bc-eedd7160adf0

trunk/import/migration/tasks/__init__.py
trunk/import/migration/tasks/collections.py
trunk/import/migration/tasks/core.py
trunk/import/migration/tasks/dates.py [new file with mode: 0644]
trunk/import/migration/tasks/items.py

index 5598a9f7aad7bbd91b49572c5e6e42d4981d3ad6..b28e9b875137ce5b50cdf6a300b98b4576543bce 100644 (file)
@@ -38,3 +38,4 @@ import ethnic
 import publishers
 import collections
 import items
+import dates
index e37bfb47629672c8fc831e47db0c1d9085819aae..509f38445afb116376259ce316ce84f604d36553 100644 (file)
@@ -37,7 +37,6 @@ from core import DataMigrator, EnumMapper
 from _mysql_exceptions import IntegrityError
 from MySQLdb.constants.ER import DUP_ENTRY
 import re
-from datetime import date
 
 class CollectionsCopyMigrator(DataMigrator):
     """Perform a preliminary raw copy of the collection table"""
@@ -334,75 +333,6 @@ class CollectionsPublishersMapper(DataMigrator):
         self.target_cursor.execute("SELECT COUNT(*) FROM media_collections WHERE publisher_collection_id IS NOT NULL")
         self.stats['with_collection'] = self.target_cursor.fetchone()[0]
 
-class CollectionsYearConverter(DataMigrator):
-    """Convert collection recording years"""
-
-    implements(IDataMigrator)
-
-    def get_name(self):
-        return "collections:date"
-
-    def parse_year(self, year):
-        if year < 100:
-            return year + 1900
-        elif year >= 1900 and year <= date.today().year:
-            return year
-
-        return 0
-
-    def process(self):
-        
-        self.stats = {
-            'total':          0,
-            'unsignificant':  0,
-            'unparsed':       0,
-            'migrated':       0,
-            'empty':          0,
-            'incoherent':     0
-        }
-   
-        self.src_cursor.execute("SELECT Cote, Annee_Enr FROM Support")
-
-        while True:
-            row = self.src_cursor.fetchone()
-            if not row:
-                break
-
-            old_code = row[0]                
-            year_str = row[1]
-
-            from_year = 0
-            to_year   = 0
-
-            self.stats['total'] += 1
-            if year_str == '':
-                self.stats['empty'] += 1
-            elif re.match('^[+-]?[Nn12?]$', year_str):
-                self.stats['unsignificant'] += 1
-            else:
-                match = re.match(r'^<?([0-9]{2,4})[;/,>-]\'?([0-9]{2,4})[\';/,>?-]?$', year_str)
-                if not match:
-                    match = re.match(r'^([0-9]{2,4})[;/,>?-]?$', year_str)
-                
-                if match:
-                    from_year = self.parse_year(int(match.group(1)))
-                    if match.lastindex > 1:
-                        to_year = self.parse_year(int(match.group(2)))
-                    else:
-                        to_year = from_year
-
-                    if from_year or to_year and (not to_year or to_year >= from_year):
-                        self.stats['migrated'] += 1
-                    else:    
-                        self.stats['incoherent'] += 1
-
-                else:
-                    self.stats['unparsed'] += 1
-
-            self.target_cursor.execute("UPDATE media_collections SET recorded_from_year = %s, "
-                                       "recorded_to_year = %s WHERE old_code = %s", (from_year, to_year, old_code))
-
-
 class CollectionsCollectorNormalizer(DataMigrator):
     """Migrate collector_is_creator flag"""
 
index 85fdad3c0af70657376237edbf01cdc0d57cc417..e26b7435e55268fb01d77155a546e0ec01a901d5 100644 (file)
@@ -33,6 +33,7 @@
 
 import sys
 from telemeta.core import *
+from datetime import date
 
 class DataMigrationTask(Component):
     
@@ -60,6 +61,18 @@ class DataMigrator(DataMigrationTask):
 
         return assign
 
+    def parse_year(self, year):
+        if year < 100 and year > 0:
+            if year <= date.today().year % 100:
+                return year + 2000
+            else:
+                return year + 1900
+        elif year >= 1900 and year <= date.today().year:
+            return year
+
+        return 0
+
+
 class DataInitializer(DataMigrationTask):
     pass
 
diff --git a/trunk/import/migration/tasks/dates.py b/trunk/import/migration/tasks/dates.py
new file mode 100644 (file)
index 0000000..61ff837
--- /dev/null
@@ -0,0 +1,357 @@
+# -*- coding: utf-8 -*-
+#
+# CREM Database migrator
+
+# Copyright (C) 2009 Samalyse SARL
+# Author: Olivier Guilyardi <olivier samalyse com>
+
+# This software is governed by the CeCILL  license under French law and
+# abiding by the rules of distribution of free software.  You can  use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability.
+
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and,  more generally, to use and operate it in the
+# same conditions as regards security.
+
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+from telemeta.core import *
+from api import IDataMigrator
+from core import DataMigrator
+import re
+from datetime import date, datetime
+
+class CollectionsYearConverter(DataMigrator):
+    """Perform a preliminary conversion of the collections recording years"""
+
+    implements(IDataMigrator)
+
+    def get_name(self):
+        return "dates:collections"
+
+    def process(self):
+        
+        self.stats = {
+            'total':          0,
+            'unsignificant':  0,
+            'unparsed':       0,
+            'migrated':       0,
+            'empty':          0,
+            'incoherent':     0
+        }
+   
+        self.src_cursor.execute("SELECT Cote, Annee_Enr FROM Support")
+
+        while True:
+            row = self.src_cursor.fetchone()
+            if not row:
+                break
+
+            old_code = row[0]                
+            year_str = row[1]
+
+            from_year = 0
+            to_year   = 0
+
+            self.stats['total'] += 1
+            if year_str == '':
+                self.stats['empty'] += 1
+            elif re.match('^[+-]?[Nn12?]$', year_str):
+                self.stats['unsignificant'] += 1
+            else:
+                match = re.match(r'^<?([0-9]{2,4})[;/,>-]\'?([0-9]{2,4})[\';/,>?-]?$', year_str)
+                if not match:
+                    match = re.match(r'^([0-9]{2,4})[;/,>?-]?$', year_str)
+                
+                if match:
+                    from_year = self.parse_year(int(match.group(1)))
+                    if match.lastindex > 1:
+                        to_year = self.parse_year(int(match.group(2)))
+                    else:
+                        to_year = from_year
+
+                    if from_year or to_year and (not to_year or to_year >= from_year):
+                        self.stats['migrated'] += 1
+                    else:    
+                        self.stats['incoherent'] += 1
+
+                else:
+                    self.stats['unparsed'] += 1
+
+            self.target_cursor.execute("UPDATE media_collections SET recorded_from_year = %s, "
+                                       "recorded_to_year = %s WHERE old_code = %s", (from_year, to_year, old_code))
+
+class ItemsDateConverter(DataMigrator):
+    """Convert items recording dates"""
+
+    implements(IDataMigrator)
+
+    def get_name(self):
+        return "dates:items"
+
+    def parse_month(self, str, patterns):
+        for i in patterns:
+            if re.match('^' + patterns[i] + '$', str, re.IGNORECASE):
+                return i
+        return 0
+
+    def last_day_of_month(self, month, year):
+        if month == 2:
+            if year % 4 == 0:
+                return 29
+            else:
+                return 28
+        elif month < 8:
+            if month % 2 == 1:
+                return 31
+            else:
+                return 30
+        else:
+            if month % 2 == 0:
+                return 31
+            else:
+                return 30
+
+    def parse_date_data(self, data, default_year, month_names):
+        
+        if data.has_key('year') and int(data['year']):
+            year = self.parse_year(int(data['year']))
+            if data.has_key('until_year'):
+                until_year = self.parse_year(int(data['until_year']))
+            else:
+                until_year = year
+        else:
+            year        = default_year
+            until_year  = year
+        
+        if data.has_key('name'):
+            data['month'] = self.parse_month(data['name'], month_names)
+
+        if data.has_key('until_name'):
+            data['until_month'] = self.parse_month(data['until_name'], month_names)
+
+        if data.has_key('month') and int(data['month']):
+            month = int(data['month'])
+            if data.has_key('until_month'):
+                until_month = int(data['until_month'])
+            else:
+                until_month = month
+        else:
+            month = 1
+            until_month = 12
+
+        if data.has_key('day') and int(data['day']):
+            day = int(data['day'])
+            if data.has_key('until_day'):
+                until_day = int(data['until_day'])
+            else:
+                until_day = day 
+        else:
+            day = 1
+            until_day = self.last_day_of_month(until_month, until_year)
+
+        try:
+            from_date = date(year, month, day)
+            try:
+                until_date = date(until_year, until_month, until_day)
+                if until_date >= from_date:
+                    return (from_date, until_date)
+                else:
+                    return (from_date, from_date)
+            except ValueError:
+                return (from_date, from_date)
+        except ValueError:
+            return None
+
+    def process(self):
+
+        names = {
+            1:  ur'jan(vier|v\.?|\.?)',
+            2:  ur'f[eé]v(rier|r\.?|\.?)',     
+            3:  ur'mar(s|\.?)',           
+            4:  ur'avr(il|\.?)',          
+            5:  ur'mai',                  
+            6:  ur'juin',                 
+            7:  ur'juil(let|\.?)',        
+            8:  ur'a[ôo][ûu]t',              
+            9:  ur'sep(tembre|t\.?|\.?)', 
+            10: ur'oct(obre|\.?)',        
+            11: ur'd[ée]c(embre|\.?)',
+            12: ur'nov(embre|\.?)'
+        }
+
+        namepattern = u'|'.join([names[i] for i in names])
+
+        patterns = [
+            # month only
+            ur'^ *(?P<name>' + namepattern  + ') *$',
+
+            # day and month, no year
+            ur'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + namepattern  + ') *$',
+
+            # from and until month
+            ur'^ *(?P<name>' + namepattern  + ') *[,-] *(?P<until_name>' + namepattern  + ') *$',
+
+            # from year only
+            ur'^ *(été|) *(?P<year>[0-9]{2,4})[ ?]*$',
+            
+            # from and until year
+            ur'^ *(?P<year>[0-9]{4}) *[/,-] *(?P<until_year>[0-9]{2,4}) *$',
+
+            # month and year
+            ur'^ *(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{2,4}) *$',
+            ur'^ *(?P<name>' + namepattern  + ') *\.? *(?P<year>[0-9]{2,4}) *-'
+               ' *(?P<until_name>' + namepattern  + ') *\.? *(?P<until_year>[0-9]{2,4}) *$',
+            ur'^ *(début|mi|fin|) *(?P<name>' + namepattern  + ') *(?P<year>[0-9]{2,4})[ ?]*$',
+
+            # from month, until month, and year
+            ur'^ *(?P<month>[0-9]{1,2}) *[aà] *(?P<until_month>[0-9]{1,2}) *- *(?P<year>[0-9]{4}) *$',
+            ur'^ *(?P<name>' + namepattern  + ') *[,/-] *(?P<until_name>' + namepattern  + ') *[./]? *(?P<year>[0-9]{2,4}) *$',
+            
+            # from month+year, until month+year
+            ur'^ *(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{2,4}) *; *(?P<until_month>[0-9]{1,2}) *[_./-] *(?P<until_year>[0-9]{2,4}) *$',
+            ur'^ *(?P<month>[0-9]{1,2}) */ *(?P<year>[0-9]{2,4}) *- *(?P<until_month>[0-9]{1,2}) */ *(?P<until_year>[0-9]{2,4}) *$',
+            
+            # day, month and year
+            ur'^ *(?P<day>[0-9]{1,2}) *[_./-] *0?(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{1,4}) *$',
+            ur'^ *(?P<year>[0-9]{4}) *[_./-] *(?P<month>[0-9]{2}) *[_./-] *(?P<day>[0-9]{2}) *$',
+            ur'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + namepattern  + ') *(?P<year>[0-9]{2,4})[? ]*$',
+
+            # from+until day, single month and year
+            ur'^ *(?P<day>[0-9]{1,2})[ &+,-]+(?P<until_day>[0-9]{1,2}) *[ _./-] *(?P<month>[0-9]{1,2}) *[ _./-] *(?P<year>[0-9]{2,4}) *$',
+            ur'^ *(?P<day>[0-9]{1,2}) *(-|/|et|au) *(?P<until_day>[0-9]{1,2}) *(?P<name>' + namepattern  + ') *(?P<year>[0-9]{2,4})[? ]*$',
+
+        ]
+
+        self.src_cursor.execute("SELECT COUNT(*) FROM Phono")
+        self.stats = {
+            'total'     : self.src_cursor.fetchone()[0],
+            'matched'       : 0,
+            'converted'     : 0,
+            'unsignificant' : 0,
+            'unparsed'      : 0,
+            'incoherent'    : 0,
+            'nosuchitem'    : 0
+        }
+
+        self.target_cursor.execute("UPDATE media_items SET recorded_from_date = 0, recorded_to_date = 0")
+
+        self.src_cursor.execute("SELECT Cote_Phono, Annee_Enreg, Dates_Enregistr FROM Phono")
+
+        i = 0
+        while True:
+            row = self.src_cursor.fetchone()
+            if not row:
+                break
+
+            old_code        = row[0]
+            year_str        = row[1]
+            default_year    = self.parse_year(int(year_str))
+            date_str        = row[2]
+
+            data = {}
+            if re.match(r'^ *(nn?|=|id|idem|\?+|[-_]?1|[0 /]*) *$', date_str, re.IGNORECASE):
+                self.stats['unsignificant'] += 1
+            else:
+                for p in patterns:
+                    match = re.match(p, date_str, re.IGNORECASE)
+                    if match:
+                        self.stats['matched'] += 1
+                        data = match.groupdict()
+                        break
+
+            if data or default_year:
+                dates = self.parse_date_data(data, default_year, names)
+                if dates:
+                    self.target_cursor.execute('UPDATE media_items SET recorded_from_date = %s, recorded_to_date = %s '
+                                               'WHERE old_code = %s', 
+                                               (dates[0].strftime('%Y-%m-%d'), dates[1].strftime('%Y-%m-%d'), old_code))
+                    if self.target_cursor.rowcount:
+                        self.stats['converted'] += 1
+                    else:
+                        self.stats['nosuchitem'] += 1
+                else:
+                    print "incoherent date (%s): %s -- %s" % (old_code, str(data), str(row))
+                    self.stats['incoherent'] += 1
+                    
+            else:
+                #print '|%s|' % row[0]
+                self.stats['unparsed'] += 1
+
+            i += 1
+            if i % 1000 == 0:
+                self.step()
+
+class DateRangeSynchronizer(DataMigrator):
+    "Ensures collections recording years contain items recording dates"
+    
+    implements(IDataMigrator)
+
+    def get_name(self):
+        return "dates:sync"
+
+    def process(self):
+        self.stats = {'synced' : 0}
+
+        self.target_cursor.execute("SELECT id, recorded_from_year, recorded_to_year FROM media_collections")
+
+        items_cursor = self.target_db.cursor()
+        update_cursor = self.target_db.cursor()
+        i = 0
+        while True:
+            row = self.target_cursor.fetchone()
+            if not row:
+                break
+
+            collection_id = row[0]
+            from_year     = row[1]
+            to_year       = row[2]
+
+            synced = False
+
+            items_cursor.execute("SELECT MIN(YEAR(recorded_from_date)) FROM media_items "
+                                 "WHERE collection_id = %s AND recorded_from_date <> 0", (collection_id,))
+            itemdata = items_cursor.fetchone()
+            if itemdata:
+                item_min_year = itemdata[0]
+                if item_min_year and (not from_year or item_min_year < from_year):
+                    update_cursor.execute("UPDATE media_collections SET recorded_from_year = %s WHERE id = %s", (item_min_year,collection_id))
+                    synced = True
+
+            items_cursor.execute("SELECT MAX(YEAR(recorded_to_date)) FROM media_items "
+                                 "WHERE collection_id = %s AND recorded_to_date <> 0", (collection_id,))
+            itemdata = items_cursor.fetchone()
+            if itemdata:
+                item_max_year = itemdata[0]
+                if item_max_year and (not to_year or item_max_year > to_year):
+                    update_cursor.execute("UPDATE media_collections SET recorded_to_year = %s WHERE id = %s", (item_max_year,collection_id))
+                    synced = True
+
+            if synced:
+                self.stats['synced'] += 1
+
+            if i % 400 == 0:
+                self.step()
+            i += 1
+                        
+            
+        
+
+
+
index 5d1d5299b137d28496d9f42869029fe846543cfb..d576f8266b2c689bbf95744429807349a60a5652 100644 (file)
@@ -36,7 +36,6 @@ from api import IDataMigrator
 from core import DataMigrator, EnumMapper
 from _mysql_exceptions import IntegrityError
 from MySQLdb.constants.ER import DUP_ENTRY
-import re
 
 class ItemsCopyMigrator(DataMigrator):
     """Perform a preliminary raw copy of the item table"""
@@ -117,111 +116,3 @@ class ItemsEnumMapper(EnumMapper):
     def process(self):
         EnumMapper.process(self, 'Phono', 'Cote_Phono', 'media_items', self.map)
 
-class ItemsDateConverter(DataMigrator):
-    """Convert items recording dates"""
-
-    implements(IDataMigrator)
-
-    def get_name(self):
-        return "items:date"
-
-    def process(self):
-
-        names = {
-            1:  ur'jan(vier|v\.?|\.?)',
-            2:  ur'f[eé]v(rier|r\.?|\.?)',     
-            3:  ur'mar(s|\.?)',           
-            4:  ur'avr(il|\.?)',          
-            5:  ur'mai',                  
-            6:  ur'juin',                 
-            7:  ur'juil(let|\.?)',        
-            8:  ur'a[ôo][ûu]t',              
-            9:  ur'sep(tembre|t\.?|\.?)', 
-            10: ur'oct(obre|\.?)',        
-            11: ur'd[ée]c(embre|\.?)',
-            12: ur'nov(embre|\.?)'
-        }
-
-        namepattern = u'|'.join([names[i] for i in names])
-
-        patterns = [
-            # month only
-            ur'^ *(?P<name>' + namepattern  + ') *$',
-
-            # day and month, no year
-            ur'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + namepattern  + ') *$',
-
-            # from and until month
-            ur'^ *(?P<name>' + namepattern  + ') *[,-] *(?P<until_name>' + namepattern  + ') *$',
-
-            # from year only
-            ur'^ *(été|) *(?P<year>[0-9]{2,4})[ ?]*$',
-            
-            # from and until year
-            ur'^ *(?P<year>[0-9]{4}) *[/,-] *(?P<until_year>[0-9]{2,4}) *$',
-
-            # month and year
-            ur'^ *(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{2,4}) *$',
-            ur'^ *(?P<name>' + namepattern  + ') *\.? *(?P<year>[0-9]{2,4}) *-'
-               ' *(?P<until_name>' + namepattern  + ') *\.? *(?P<until_year>[0-9]{2,4}) *$',
-            ur'^ *(début|mi|fin|) *(?P<name>' + namepattern  + ') *(?P<year>[0-9]{2,4})[ ?]*$',
-
-            # from month, until month, and year
-            ur'^ *(?P<month>[0-9]{1,2}) *[aà] *(?P<until_month>[0-9]{1,2}) *- *(?P<year>[0-9]{4}) *$',
-            ur'^ *(?P<name>' + namepattern  + ') *[,/-] *(?P<until_name>' + namepattern  + ') *[./]? *(?P<year>[0-9]{2,4}) *$',
-            
-            # from month+year, until month+year
-            ur'^ *(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{2,4}) *; *(?P<until_month>[0-9]{1,2}) *[_./-] *(?P<until_year>[0-9]{2,4}) *$',
-            ur'^ *(?P<month>[0-9]{1,2}) */ *(?P<year>[0-9]{2,4}) *- *(?P<until_month>[0-9]{1,2}) */ *(?P<until_year>[0-9]{2,4}) *$',
-            
-            # day, month and year
-            ur'^ *(?P<day>[0-9]{1,2}) *[_./-] *0?(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{1,4}) *$',
-            ur'^ *(?P<year>[0-9]{4}) *[_./-] *(?P<month>[0-9]{2}) *[_./-] *(?P<day>[0-9]{2}) *$',
-            ur'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + namepattern  + ') *(?P<year>[0-9]{2,4})[? ]*$',
-
-            # from+until day, single month and year
-            ur'^ *(?P<day>[0-9]{1,2})[ &+,-]+(?P<until_day>[0-9]{1,2}) *[ _./-] *(?P<month>[0-9]{1,2}) *[ _./-] *(?P<year>[0-9]{2,4}) *$',
-            ur'^ *(?P<day>[0-9]{1,2}) *(-|/|et|au) *(?P<until_day>[0-9]{1,2}) *(?P<name>' + namepattern  + ') *(?P<year>[0-9]{2,4})[? ]*$',
-
-        ]
-
-        self.src_cursor.execute("SELECT COUNT(*) FROM Phono")
-        self.stats = {
-            'total'     : self.src_cursor.fetchone()[0],
-            'matched'       : 0,
-            'empty'         : 0,
-            'unsignificant' : 0,
-            'unparsed'      : 0
-        }
-
-        self.src_cursor.execute("SELECT COUNT(*) FROM Phono WHERE Dates_Enregistr REGEXP '^ *$'")
-        self.stats['empty'] = self.src_cursor.fetchone()[0]
-
-        self.src_cursor.execute("SELECT Dates_Enregistr FROM Phono WHERE Dates_Enregistr NOT REGEXP '^ *$'")
-
-        print "WARNING: this migrator is a work in progress"
-        while True:
-            row = self.src_cursor.fetchone()
-            if not row:
-                break
-
-            recognized = False
-            if re.match('^ *(nn?|=|id|idem|\?+|[-_]?1|[0 ]*) *$', row[0], re.IGNORECASE):
-                self.stats['unsignificant'] += 1
-                recognized = True
-            else:
-                for p in patterns:
-                    if re.match(p, row[0], re.IGNORECASE):
-                        self.stats['matched'] += 1
-                        recognized = True
-                        break
-
-            if not recognized:
-                #print '|%s|' % row[0]
-                self.stats['unparsed'] += 1
-
-                    
-
-
-
-