From 059fe4b29b392721d00d1676940bf88b31848621 Mon Sep 17 00:00:00 2001 From: olivier Date: Wed, 10 Jun 2009 21:39:33 +0000 Subject: [PATCH] migration: start to implement items date converter (some regex for now) git-svn-id: http://svn.parisson.org/svn/crem@96 3bf09e05-f825-4182-b9bc-eedd7160adf0 --- trunk/import/migration/tasks/items.py | 107 ++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/trunk/import/migration/tasks/items.py b/trunk/import/migration/tasks/items.py index d576f82..06ffa3f 100644 --- a/trunk/import/migration/tasks/items.py +++ b/trunk/import/migration/tasks/items.py @@ -36,6 +36,7 @@ from api import IDataMigrator from core import DataMigrator, EnumMapper from _mysql_exceptions import IntegrityError from MySQLdb.constants.ER import DUP_ENTRY +import re class ItemsCopyMigrator(DataMigrator): """Perform a preliminary raw copy of the item table""" @@ -116,3 +117,109 @@ class ItemsEnumMapper(EnumMapper): def process(self): EnumMapper.process(self, 'Phono', 'Cote_Phono', 'media_items', self.map) +class ItemsDateConverter(DataMigrator): + """Convert items recording dates""" + + implements(IDataMigrator) + + def get_name(self): + return "items:date" + + def process(self): + + names = { + 1: ur'jan(vier|v\.?|\.?)', + 2: ur'f[eé]v(rier|r\.?|\.?)', + 3: ur'mar(s|\.?)', + 4: ur'avr(il|\.?)', + 5: ur'mai', + 6: ur'juin', + 7: ur'juil(let|\.?)', + 8: ur'a[ôo][ûu]t', + 9: ur'sep(tembre|t\.?|\.?)', + 10: ur'oct(obre|\.?)', + 11: ur'd[ée]c(embre|\.?)', + 12: ur'nov(embre|\.?)' + } + + namepattern = u'|'.join([names[i] for i in names]) + + patterns = [ + # month only + ur'^ *(?P' + namepattern + ') *$', + + # day and month, no year + ur'^ *(?P[0-9]{1,2}) *(?P' + namepattern + ') *$', + + # from and until month + ur'^ *(?P' + namepattern + ') *[,-] *(?P' + namepattern + ') *$', + + # from year only + ur'^ *(été|) *(?P[0-9]{2,4})[ ?]*$', + + # from and until year + ur'^ *(?P[0-9]{4}) *[/,-] *(?P[0-9]{2,4}) *$', + + # month and year + ur'^ *(?P[0-9]{1,2}) *[_./-] *(?P[0-9]{2,4}) *$', + ur'^ *(?P' + namepattern + ') *\.? *(?P[0-9]{2,4}) *-' + ' *(?P' + namepattern + ') *\.? *(?P[0-9]{2,4}) *$', + ur'^ *(début|mi|fin|) *(?P' + namepattern + ') *(?P[0-9]{2,4})[ ?]*$', + + # from month, until month, and year + ur'^ *(?P[0-9]{1,2}) *[aà] *(?P[0-9]{1,2}) *- *(?P[0-9]{4}) *$', + ur'^ *(?P' + namepattern + ') *[,/-] *(?P' + namepattern + ') *[./]? *(?P[0-9]{2,4}) *$', + + # from month+year, until month+year + ur'^ *(?P[0-9]{1,2}) *[_./-] *(?P[0-9]{2,4}) *; *(?P[0-9]{1,2}) *[_./-] *(?P[0-9]{2,4}) *$', + ur'^ *(?P[0-9]{1,2}) */ *(?P[0-9]{2,4}) *- *(?P[0-9]{1,2}) */ *(?P[0-9]{2,4}) *$', + + # day, month and year + ur'^ *(?P[0-9]{1,2}) *[_./-] *0?(?P[0-9]{1,2}) *[_./-] *(?P[0-9]{1,4}) *$', + ur'^ *(?P[0-9]{4}) *[_./-] *(?P[0-9]{2}) *[_./-] *(?P[0-9]{2}) *$', + ur'^ *(?P[0-9]{1,2}) *(?P' + namepattern + ') *(?P[0-9]{2,4})[? ]*$', + + # from+until day, single month and year + ur'^ *(?P[0-9]{1,2})[ &+,-]+(?P[0-9]{1,2}) *[ _./-] *(?P[0-9]{1,2}) *[ _./-] *(?P[0-9]{2,4}) *$', + ur'^ *(?P[0-9]{1,2}) *(-|/|et|au) *(?P[0-9]{1,2}) *(?P' + namepattern + ') *(?P[0-9]{2,4})[? ]*$', + + ] + + self.src_cursor.execute("SELECT COUNT(*) FROM Phono") + self.stats = { + 'total' : self.src_cursor.fetchone()[0], + 'matched' : 0, + 'empty' : 0, + 'unsignificant' : 0, + 'unparsed' : 0 + } + + self.src_cursor.execute("SELECT COUNT(*) FROM Phono WHERE Dates_Enregistr REGEXP '^ *$'") + self.stats['empty'] = self.src_cursor.fetchone()[0] + + self.src_cursor.execute("SELECT Dates_Enregistr FROM Phono WHERE Dates_Enregistr NOT REGEXP '^ *$'") + while True: + row = self.src_cursor.fetchone() + if not row: + break + + recognized = False + if re.match('^ *(nn?|=|id|idem|\?+|[-_]?1|[0 ]*) *$', row[0], re.IGNORECASE): + self.stats['unsignificant'] += 1 + recognized = True + else: + for p in patterns: + if re.match(p, row[0], re.IGNORECASE): + self.stats['matched'] += 1 + recognized = True + break + + if not recognized: + #print '|%s|' % row[0] + self.stats['unparsed'] += 1 + + + + + + -- 2.39.5