git.parisson.com Git - telemeta-data.git/commitdiff
migration: start to implement items date converter (some regex for now)
author olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Wed, 10 Jun 2009 21:39:33 +0000 (21:39 +0000)
committer olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Wed, 10 Jun 2009 21:39:33 +0000 (21:39 +0000)
git-svn-id: http://svn.parisson.org/svn/crem@96 3bf09e05-f825-4182-b9bc-eedd7160adf0

trunk/import/migration/tasks/items.py

index d576f8266b2c689bbf95744429807349a60a5652..06ffa3fb05491453502d428f543f4f5d6422f346 100644 (file)
@@ -36,6 +36,7 @@ from api import IDataMigrator
 from core import DataMigrator, EnumMapper
 from _mysql_exceptions import IntegrityError
 from MySQLdb.constants.ER import DUP_ENTRY
+import re
 
 class ItemsCopyMigrator(DataMigrator):
     """Perform a preliminary raw copy of the item table"""
@@ -116,3 +117,109 @@ class ItemsEnumMapper(EnumMapper):
     def process(self):
         EnumMapper.process(self, 'Phono', 'Cote_Phono', 'media_items', self.map)
 
+class ItemsDateConverter(DataMigrator):
+    """Classify the free-form recording dates of the source Phono table.
+
+    For now nothing is converted: process() only fills self.stats with counts.
+    """
+    implements(IDataMigrator)
+
+    def get_name(self):
+        """Return the name of this migration task."""
+        return "items:date"
+
+    def process(self):
+        """Count empty, unsignificant, matched and unparsed Dates_Enregistr values."""
+        # French month names and abbreviations, keyed by month number
+        names = {
+            1:  ur'jan(vier|v\.?|\.?)',
+            2:  ur'f[eé]v(rier|r\.?|\.?)',
+            3:  ur'mar(s|\.?)',
+            4:  ur'avr(il|\.?)',
+            5:  ur'mai',
+            6:  ur'juin',
+            7:  ur'juil(let|\.?)',
+            8:  ur'a[ôo][ûu]t',
+            9:  ur'sep(tembre|t\.?|\.?)',
+            10: ur'oct(obre|\.?)',
+            11: ur'nov(embre|\.?)',
+            12: ur'd[ée]c(embre|\.?)'
+        }
+        namepattern = u'|'.join([names[i] for i in sorted(names)])
+
+        patterns = [
+            # month only
+            ur'^ *(?P<name>' + namepattern + ') *$',
+
+            # day and month, no year
+            ur'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + namepattern + ') *$',
+
+            # from and until month
+            ur'^ *(?P<name>' + namepattern + ') *[,-] *(?P<until_name>' + namepattern + ') *$',
+
+            # from year only
+            ur'^ *(été|) *(?P<year>[0-9]{2,4})[ ?]*$',
+
+            # from and until year
+            ur'^ *(?P<year>[0-9]{4}) *[/,-] *(?P<until_year>[0-9]{2,4}) *$',
+
+            # month and year
+            ur'^ *(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{2,4}) *$',
+            ur'^ *(?P<name>' + namepattern + r') *\.? *(?P<year>[0-9]{2,4}) *-'
+               r' *(?P<until_name>' + namepattern + r') *\.? *(?P<until_year>[0-9]{2,4}) *$',
+            ur'^ *(début|mi|fin|) *(?P<name>' + namepattern + ') *(?P<year>[0-9]{2,4})[ ?]*$',
+
+            # from month, until month, and year
+            ur'^ *(?P<month>[0-9]{1,2}) *[aà] *(?P<until_month>[0-9]{1,2}) *- *(?P<year>[0-9]{4}) *$',
+            ur'^ *(?P<name>' + namepattern + ') *[,/-] *(?P<until_name>' + namepattern + ') *[./]? *(?P<year>[0-9]{2,4}) *$',
+
+            # from month+year, until month+year
+            ur'^ *(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{2,4}) *; *(?P<until_month>[0-9]{1,2}) *[_./-] *(?P<until_year>[0-9]{2,4}) *$',
+            ur'^ *(?P<month>[0-9]{1,2}) */ *(?P<year>[0-9]{2,4}) *- *(?P<until_month>[0-9]{1,2}) */ *(?P<until_year>[0-9]{2,4}) *$',
+
+            # day, month and year
+            ur'^ *(?P<day>[0-9]{1,2}) *[_./-] *0?(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{1,4}) *$',
+            ur'^ *(?P<year>[0-9]{4}) *[_./-] *(?P<month>[0-9]{2}) *[_./-] *(?P<day>[0-9]{2}) *$',
+            ur'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + namepattern + ') *(?P<year>[0-9]{2,4})[? ]*$',
+
+            # from+until day, single month and year
+            ur'^ *(?P<day>[0-9]{1,2})[ &+,-]+(?P<until_day>[0-9]{1,2}) *[ _./-] *(?P<month>[0-9]{1,2}) *[ _./-] *(?P<year>[0-9]{2,4}) *$',
+            ur'^ *(?P<day>[0-9]{1,2}) *(-|/|et|au) *(?P<until_day>[0-9]{1,2}) *(?P<name>' + namepattern + ') *(?P<year>[0-9]{2,4})[? ]*$',
+
+        ]
+
+        # UNICODE is needed for IGNORECASE to apply to accented letters (Python 2)
+        flags = re.IGNORECASE | re.UNICODE
+        compiled = [re.compile(p, flags) for p in patterns]
+        # placeholder values counted as unsignificant: n, nn, =, id, idem, ?.., 1, -1, _1, zeros
+        unsignificant = re.compile(r'^ *(nn?|=|id|idem|\?+|[-_]?1|[0 ]*) *$', re.IGNORECASE)
+
+        self.src_cursor.execute("SELECT COUNT(*) FROM Phono")
+        self.stats = {
+            'total'         : self.src_cursor.fetchone()[0],
+            'matched'       : 0,
+            'empty'         : 0,
+            'unsignificant' : 0,
+            'unparsed'      : 0
+        }
+
+        self.src_cursor.execute("SELECT COUNT(*) FROM Phono WHERE Dates_Enregistr REGEXP '^ *$'")
+        self.stats['empty'] = self.src_cursor.fetchone()[0]
+
+        self.src_cursor.execute("SELECT Dates_Enregistr FROM Phono WHERE Dates_Enregistr NOT REGEXP '^ *$'")
+        while True:
+            row = self.src_cursor.fetchone()
+            if not row:
+                break
+            if unsignificant.match(row[0]):
+                self.stats['unsignificant'] += 1
+            elif any(p.match(row[0]) for p in compiled):
+                self.stats['matched'] += 1
+            else:
+                self.stats['unparsed'] += 1
+
+                    
+
+
+
+