From a4658c4f1048cb657463e75a7ed9ae66f48c9e7d Mon Sep 17 00:00:00 2001
From: olivier
Date: Thu, 11 Jun 2009 16:02:59 +0000
Subject: [PATCH] migration: moved dates-related tasks to a specific module,
 improved items date converter, added collections/items dates synchronizer

git-svn-id: http://svn.parisson.org/svn/crem@98 3bf09e05-f825-4182-b9bc-eedd7160adf0
---
 trunk/import/migration/tasks/__init__.py    |   1 +
 trunk/import/migration/tasks/collections.py |  70 ----
 trunk/import/migration/tasks/core.py        |  13 +
 trunk/import/migration/tasks/dates.py       | 357 ++++++++++++++++++++
 trunk/import/migration/tasks/items.py       | 109 ------
 5 files changed, 371 insertions(+), 179 deletions(-)
 create mode 100644 trunk/import/migration/tasks/dates.py

diff --git a/trunk/import/migration/tasks/__init__.py b/trunk/import/migration/tasks/__init__.py
index 5598a9f..b28e9b8 100644
--- a/trunk/import/migration/tasks/__init__.py
+++ b/trunk/import/migration/tasks/__init__.py
@@ -38,3 +38,4 @@ import ethnic
 import publishers
 import collections
 import items
+import dates
diff --git a/trunk/import/migration/tasks/collections.py b/trunk/import/migration/tasks/collections.py
index e37bfb4..509f384 100644
--- a/trunk/import/migration/tasks/collections.py
+++ b/trunk/import/migration/tasks/collections.py
@@ -37,7 +37,6 @@ from core import DataMigrator, EnumMapper
 from _mysql_exceptions import IntegrityError
 from MySQLdb.constants.ER import DUP_ENTRY
 import re
-from datetime import date
 
 class CollectionsCopyMigrator(DataMigrator):
     """Perform a preliminary raw copy of the collection table"""
@@ -334,75 +333,6 @@ class CollectionsPublishersMapper(DataMigrator):
         self.target_cursor.execute("SELECT COUNT(*) FROM media_collections WHERE publisher_collection_id IS NOT NULL")
         self.stats['with_collection'] = self.target_cursor.fetchone()[0]
 
-class CollectionsYearConverter(DataMigrator):
-    """Convert collection recording years"""
-
-    implements(IDataMigrator)
-
-    def get_name(self):
-        return "collections:date"
-
-    def parse_year(self, year):
-        if year < 100:
-            return year + 1900
-        elif year >= 1900 and year <= date.today().year:
-            return year
-
-        return 0
-
-    def process(self):
-
-        self.stats = {
-            'total': 0,
-            'unsignificant': 0,
-            'unparsed': 0,
-            'migrated': 0,
-            'empty': 0,
-            'incoherent': 0
-        }
-
-        self.src_cursor.execute("SELECT Cote, Annee_Enr FROM Support")
-
-        while True:
-            row = self.src_cursor.fetchone()
-            if not row:
-                break
-
-            old_code = row[0]
-            year_str = row[1]
-
-            from_year = 0
-            to_year = 0
-
-            self.stats['total'] += 1
-            if year_str == '':
-                self.stats['empty'] += 1
-            elif re.match('^[+-]?[Nn12?]$', year_str):
-                self.stats['unsignificant'] += 1
-            else:
-                match = re.match(r'^-]\'?([0-9]{2,4})[\';/,>?-]?$', year_str)
-                if not match:
-                    match = re.match(r'^([0-9]{2,4})[;/,>?-]?$', year_str)
-
-                if match:
-                    from_year = self.parse_year(int(match.group(1)))
-                    if match.lastindex > 1:
-                        to_year = self.parse_year(int(match.group(2)))
-                    else:
-                        to_year = from_year
-
-                    if from_year or to_year and (not to_year or to_year >= from_year):
-                        self.stats['migrated'] += 1
-                    else:
-                        self.stats['incoherent'] += 1
-
-                else:
-                    self.stats['unparsed'] += 1
-
-            self.target_cursor.execute("UPDATE media_collections SET recorded_from_year = %s, "
-                "recorded_to_year = %s WHERE old_code = %s", (from_year, to_year, old_code))
-
-
 class CollectionsCollectorNormalizer(DataMigrator):
     """Migrate collector_is_creator flag"""
 
diff --git a/trunk/import/migration/tasks/core.py b/trunk/import/migration/tasks/core.py
index 85fdad3..e26b743 100644
--- a/trunk/import/migration/tasks/core.py
+++ b/trunk/import/migration/tasks/core.py
@@ -33,6 +33,7 @@ import sys
 
 from telemeta.core import *
+from datetime import date
 
 class DataMigrationTask(Component):
 
@@ -60,6 +61,18 @@
         return assign
 
+    def parse_year(self, year):
+        if year < 100 and year > 0:
+            if year <= date.today().year % 100:
+                return year + 2000
+            else:
+                return year + 1900
+        elif year >= 1900 and year <= date.today().year:
+            return year
+
+        return 0
+
+
 class DataInitializer(DataMigrationTask):
     pass
 
diff --git a/trunk/import/migration/tasks/dates.py b/trunk/import/migration/tasks/dates.py
new file mode 100644
index 0000000..61ff837
--- /dev/null
+++ b/trunk/import/migration/tasks/dates.py
@@ -0,0 +1,357 @@
+# -*- coding: utf-8 -*-
+#
+# CREM Database migrator
+#
+# Copyright (C) 2009 Samalyse SARL
+# Author: Olivier Guilyardi
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+from telemeta.core import *
+from api import IDataMigrator
+from core import DataMigrator
+import re
+from datetime import date, datetime
+
+class CollectionsYearConverter(DataMigrator):
+    """Perform a preliminary conversion of the collections recording years"""
+
+    implements(IDataMigrator)
+
+    def get_name(self):
+        return "dates:collections"
+
+    def process(self):
+
+        self.stats = {
+            'total': 0,
+            'unsignificant': 0,
+            'unparsed': 0,
+            'migrated': 0,
+            'empty': 0,
+            'incoherent': 0
+        }
+
+        self.src_cursor.execute("SELECT Cote, Annee_Enr FROM Support")
+
+        while True:
+            row = self.src_cursor.fetchone()
+            if not row:
+                break
+
+            old_code = row[0]
+            year_str = row[1]
+
+            from_year = 0
+            to_year = 0
+
+            self.stats['total'] += 1
+            if year_str == '':
+                self.stats['empty'] += 1
+            elif re.match('^[+-]?[Nn12?]$', year_str):
+                self.stats['unsignificant'] += 1
+            else:
+                match = re.match(r'^-]\'?([0-9]{2,4})[\';/,>?-]?$', year_str)
+                if not match:
+                    match = re.match(r'^([0-9]{2,4})[;/,>?-]?$', year_str)
+
+                if match:
+                    from_year = self.parse_year(int(match.group(1)))
+                    if match.lastindex > 1:
+                        to_year = self.parse_year(int(match.group(2)))
+                    else:
+                        to_year = from_year
+
+                    if from_year or to_year and (not to_year or to_year >= from_year):
+                        self.stats['migrated'] += 1
+                    else:
+                        self.stats['incoherent'] += 1
+
+                else:
+                    self.stats['unparsed'] += 1
+
+            self.target_cursor.execute("UPDATE media_collections SET recorded_from_year = %s, "
+                "recorded_to_year = %s WHERE old_code = %s", (from_year, to_year, old_code))
+
+class ItemsDateConverter(DataMigrator):
+    """Convert items recording dates"""
+
+    implements(IDataMigrator)
+
+    def get_name(self):
+        return "dates:items"
+
+    def parse_month(self, str, patterns):
+        for i in patterns:
+            if re.match('^' + patterns[i] + '$', str, re.IGNORECASE):
+                return i
+        return 0
+
+    def last_day_of_month(self, month, year):
+        if month == 2:
+            if year % 4 == 0:
+                return 29
+            else:
+                return 28
+        elif month < 8:
+            if month % 2 == 1:
+                return 31
+            else:
+                return 30
+        else:
+            if month % 2 == 0:
+                return 31
+            else:
+                return 30
+
+    def parse_date_data(self, data, default_year, month_names):
+
+        if data.has_key('year') and int(data['year']):
+            year = self.parse_year(int(data['year']))
+            if data.has_key('until_year'):
+                until_year = self.parse_year(int(data['until_year']))
+            else:
+                until_year = year
+        else:
+            year = default_year
+            until_year = year
+
+        if data.has_key('name'):
+            data['month'] = self.parse_month(data['name'], month_names)
+
+        if data.has_key('until_name'):
+            data['until_month'] = self.parse_month(data['until_name'], month_names)
+
+        if data.has_key('month') and int(data['month']):
+            month = int(data['month'])
+            if data.has_key('until_month'):
+                until_month = int(data['until_month'])
+            else:
+                until_month = month
+        else:
+            month = 1
+            until_month = 12
+
+        if data.has_key('day') and int(data['day']):
+            day = int(data['day'])
+            if data.has_key('until_day'):
+                until_day = int(data['until_day'])
+            else:
+                until_day = day
+        else:
+            day = 1
+            until_day = self.last_day_of_month(until_month, until_year)
+
+        try:
+            from_date = date(year, month, day)
+            try:
+                until_date = date(until_year, until_month, until_day)
+                if until_date >= from_date:
+                    return (from_date, until_date)
+                else:
+                    return (from_date, from_date)
+            except ValueError:
+                return (from_date, from_date)
+        except ValueError:
+            return None
+
+    def process(self):
+
+        names = {
+            1: ur'jan(vier|v\.?|\.?)',
+            2: ur'f[eé]v(rier|r\.?|\.?)',
+            3: ur'mar(s|\.?)',
+            4: ur'avr(il|\.?)',
+            5: ur'mai',
+            6: ur'juin',
+            7: ur'juil(let|\.?)',
+            8: ur'a[ôo][ûu]t',
+            9: ur'sep(tembre|t\.?|\.?)',
+            10: ur'oct(obre|\.?)',
+            11: ur'd[ée]c(embre|\.?)',
+            12: ur'nov(embre|\.?)'
+        }
+
+        namepattern = u'|'.join([names[i] for i in names])
+
+        patterns = [
+            # month only
+            ur'^ *(?P<name>' + namepattern + ') *$',
+
+            # day and month, no year
+            ur'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + namepattern + ') *$',
+
+            # from and until month
+            ur'^ *(?P<name>' + namepattern + ') *[,-] *(?P<until_name>' + namepattern + ') *$',
+
+            # from year only
+            ur'^ *(été|) *(?P<year>[0-9]{2,4})[ ?]*$',
+
+            # from and until year
+            ur'^ *(?P<year>[0-9]{4}) *[/,-] *(?P<until_year>[0-9]{2,4}) *$',
+
+            # month and year
+            ur'^ *(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{2,4}) *$',
+            ur'^ *(?P<name>' + namepattern + ') *\.? *(?P<year>[0-9]{2,4}) *-'
+             ' *(?P<until_name>' + namepattern + ') *\.? *(?P<until_year>[0-9]{2,4}) *$',
+            ur'^ *(début|mi|fin|) *(?P<name>' + namepattern + ') *(?P<year>[0-9]{2,4})[ ?]*$',
+
+            # from month, until month, and year
+            ur'^ *(?P<month>[0-9]{1,2}) *[aà] *(?P<until_month>[0-9]{1,2}) *- *(?P<year>[0-9]{4}) *$',
+            ur'^ *(?P<name>' + namepattern + ') *[,/-] *(?P<until_name>' + namepattern + ') *[./]? *(?P<year>[0-9]{2,4}) *$',
+
+            # from month+year, until month+year
+            ur'^ *(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{2,4}) *; *(?P<until_month>[0-9]{1,2}) *[_./-] *(?P<until_year>[0-9]{2,4}) *$',
+            ur'^ *(?P<month>[0-9]{1,2}) */ *(?P<year>[0-9]{2,4}) *- *(?P<until_month>[0-9]{1,2}) */ *(?P<until_year>[0-9]{2,4}) *$',
+
+            # day, month and year
+            ur'^ *(?P<day>[0-9]{1,2}) *[_./-] *0?(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{1,4}) *$',
+            ur'^ *(?P<year>[0-9]{4}) *[_./-] *(?P<month>[0-9]{2}) *[_./-] *(?P<day>[0-9]{2}) *$',
+            ur'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + namepattern + ') *(?P<year>[0-9]{2,4})[? ]*$',
+
+            # from+until day, single month and year
+            ur'^ *(?P<day>[0-9]{1,2})[ &+,-]+(?P<until_day>[0-9]{1,2}) *[ _./-] *(?P<month>[0-9]{1,2}) *[ _./-] *(?P<year>[0-9]{2,4}) *$',
+            ur'^ *(?P<day>[0-9]{1,2}) *(-|/|et|au) *(?P<until_day>[0-9]{1,2}) *(?P<name>' + namepattern + ') *(?P<year>[0-9]{2,4})[? ]*$',
+
+        ]
+
+        self.src_cursor.execute("SELECT COUNT(*) FROM Phono")
+        self.stats = {
+            'total' : self.src_cursor.fetchone()[0],
+            'matched' : 0,
+            'converted' : 0,
+            'unsignificant' : 0,
+            'unparsed' : 0,
+            'incoherent' : 0,
+            'nosuchitem' : 0
+        }
+
+        self.target_cursor.execute("UPDATE media_items SET recorded_from_date = 0, recorded_to_date = 0")
+
+        self.src_cursor.execute("SELECT Cote_Phono, Annee_Enreg, Dates_Enregistr FROM Phono")
+
+        i = 0
+        while True:
+            row = self.src_cursor.fetchone()
+            if not row:
+                break
+
+            old_code = row[0]
+            year_str = row[1]
+            default_year = self.parse_year(int(year_str))
+            date_str = row[2]
+
+            data = {}
+            if re.match(r'^ *(nn?|=|id|idem|\?+|[-_]?1|[0 /]*) *$', date_str, re.IGNORECASE):
+                self.stats['unsignificant'] += 1
+            else:
+                for p in patterns:
+                    match = re.match(p, date_str, re.IGNORECASE)
+                    if match:
+                        self.stats['matched'] += 1
+                        data = match.groupdict()
+                        break
+
+            if data or default_year:
+                dates = self.parse_date_data(data, default_year, names)
+                if dates:
+                    self.target_cursor.execute('UPDATE media_items SET recorded_from_date = %s, recorded_to_date = %s '
+                        'WHERE old_code = %s',
+                        (dates[0].strftime('%Y-%m-%d'), dates[1].strftime('%Y-%m-%d'), old_code))
+                    if self.target_cursor.rowcount:
+                        self.stats['converted'] += 1
+                    else:
+                        self.stats['nosuchitem'] += 1
+                else:
+                    print "incoherent date (%s): %s -- %s" % (old_code, str(data), str(row))
+                    self.stats['incoherent'] += 1
+
+            else:
+                #print '|%s|' % row[0]
+                self.stats['unparsed'] += 1
+
+            i += 1
+            if i % 1000 == 0:
+                self.step()
+
+class DateRangeSynchronizer(DataMigrator):
+    "Ensures collections recording years contain items recording dates"
+
+    implements(IDataMigrator)
+
+    def get_name(self):
+        return "dates:sync"
+
+    def process(self):
+        self.stats = {'synced' : 0}
+
+        self.target_cursor.execute("SELECT id, recorded_from_year, recorded_to_year FROM media_collections")
+
+        items_cursor = self.target_db.cursor()
+        update_cursor = self.target_db.cursor()
+        i = 0
+        while True:
+            row = self.target_cursor.fetchone()
+            if not row:
+                break
+
+            collection_id = row[0]
+            from_year = row[1]
+            to_year = row[2]
+
+            synced = False
+
+            items_cursor.execute("SELECT MIN(YEAR(recorded_from_date)) FROM media_items "
+                "WHERE collection_id = %s AND recorded_from_date <> 0", (collection_id,))
+            itemdata = items_cursor.fetchone()
+            if itemdata:
+                item_min_year = itemdata[0]
+                if item_min_year and (not from_year or item_min_year < from_year):
+                    update_cursor.execute("UPDATE media_collections SET recorded_from_year = %s WHERE id = %s", (item_min_year, collection_id))
+                    synced = True
+
+            items_cursor.execute("SELECT MAX(YEAR(recorded_to_date)) FROM media_items "
+                "WHERE collection_id = %s AND recorded_to_date <> 0", (collection_id,))
+            itemdata = items_cursor.fetchone()
+            if itemdata:
+                item_max_year = itemdata[0]
+                if item_max_year and (not to_year or item_max_year > to_year):
+                    update_cursor.execute("UPDATE media_collections SET recorded_to_year = %s WHERE id = %s", (item_max_year, collection_id))
+                    synced = True
+
+            if synced:
+                self.stats['synced'] += 1
+
+            if i % 400 == 0:
+                self.step()
+            i += 1
+
+
+
+
+
diff --git a/trunk/import/migration/tasks/items.py b/trunk/import/migration/tasks/items.py
index 5d1d529..d576f82 100644
--- a/trunk/import/migration/tasks/items.py
+++ b/trunk/import/migration/tasks/items.py
@@ -36,7 +36,6 @@ from api import IDataMigrator
 from core import DataMigrator, EnumMapper
 from _mysql_exceptions import IntegrityError
 from MySQLdb.constants.ER import DUP_ENTRY
-import re
 
 class ItemsCopyMigrator(DataMigrator):
     """Perform a preliminary raw copy of the item table"""
@@ -117,111 +116,3 @@ class ItemsEnumMapper(EnumMapper):
 
     def process(self):
         EnumMapper.process(self, 'Phono', 'Cote_Phono', 'media_items', self.map)
 
-class ItemsDateConverter(DataMigrator):
-    """Convert items recording dates"""
-
-    implements(IDataMigrator)
-
-    def get_name(self):
-        return "items:date"
-
-    def process(self):
-
-        names = {
-            1: ur'jan(vier|v\.?|\.?)',
-            2: ur'f[eé]v(rier|r\.?|\.?)',
-            3: ur'mar(s|\.?)',
-            4: ur'avr(il|\.?)',
-            5: ur'mai',
-            6: ur'juin',
-            7: ur'juil(let|\.?)',
-            8: ur'a[ôo][ûu]t',
-            9: ur'sep(tembre|t\.?|\.?)',
-            10: ur'oct(obre|\.?)',
-            11: ur'd[ée]c(embre|\.?)',
-            12: ur'nov(embre|\.?)'
-        }
-
-        namepattern = u'|'.join([names[i] for i in names])
-
-        patterns = [
-            # month only
-            ur'^ *(?P<name>' + namepattern + ') *$',
-
-            # day and month, no year
-            ur'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + namepattern + ') *$',
-
-            # from and until month
-            ur'^ *(?P<name>' + namepattern + ') *[,-] *(?P<until_name>' + namepattern + ') *$',
-
-            # from year only
-            ur'^ *(été|) *(?P<year>[0-9]{2,4})[ ?]*$',
-
-            # from and until year
-            ur'^ *(?P<year>[0-9]{4}) *[/,-] *(?P<until_year>[0-9]{2,4}) *$',
-
-            # month and year
-            ur'^ *(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{2,4}) *$',
-            ur'^ *(?P<name>' + namepattern + ') *\.? *(?P<year>[0-9]{2,4}) *-'
-             ' *(?P<until_name>' + namepattern + ') *\.? *(?P<until_year>[0-9]{2,4}) *$',
-            ur'^ *(début|mi|fin|) *(?P<name>' + namepattern + ') *(?P<year>[0-9]{2,4})[ ?]*$',
-
-            # from month, until month, and year
-            ur'^ *(?P<month>[0-9]{1,2}) *[aà] *(?P<until_month>[0-9]{1,2}) *- *(?P<year>[0-9]{4}) *$',
-            ur'^ *(?P<name>' + namepattern + ') *[,/-] *(?P<until_name>' + namepattern + ') *[./]? *(?P<year>[0-9]{2,4}) *$',
-
-            # from month+year, until month+year
-            ur'^ *(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{2,4}) *; *(?P<until_month>[0-9]{1,2}) *[_./-] *(?P<until_year>[0-9]{2,4}) *$',
-            ur'^ *(?P<month>[0-9]{1,2}) */ *(?P<year>[0-9]{2,4}) *- *(?P<until_month>[0-9]{1,2}) */ *(?P<until_year>[0-9]{2,4}) *$',
-
-            # day, month and year
-            ur'^ *(?P<day>[0-9]{1,2}) *[_./-] *0?(?P<month>[0-9]{1,2}) *[_./-] *(?P<year>[0-9]{1,4}) *$',
-            ur'^ *(?P<year>[0-9]{4}) *[_./-] *(?P<month>[0-9]{2}) *[_./-] *(?P<day>[0-9]{2}) *$',
-            ur'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + namepattern + ') *(?P<year>[0-9]{2,4})[? ]*$',
-
-            # from+until day, single month and year
-            ur'^ *(?P<day>[0-9]{1,2})[ &+,-]+(?P<until_day>[0-9]{1,2}) *[ _./-] *(?P<month>[0-9]{1,2}) *[ _./-] *(?P<year>[0-9]{2,4}) *$',
-            ur'^ *(?P<day>[0-9]{1,2}) *(-|/|et|au) *(?P<until_day>[0-9]{1,2}) *(?P<name>' + namepattern + ') *(?P<year>[0-9]{2,4})[? ]*$',
-
-        ]
-
-        self.src_cursor.execute("SELECT COUNT(*) FROM Phono")
-        self.stats = {
-            'total' : self.src_cursor.fetchone()[0],
-            'matched' : 0,
-            'empty' : 0,
-            'unsignificant' : 0,
-            'unparsed' : 0
-        }
-
-        self.src_cursor.execute("SELECT COUNT(*) FROM Phono WHERE Dates_Enregistr REGEXP '^ *$'")
-        self.stats['empty'] = self.src_cursor.fetchone()[0]
-
-        self.src_cursor.execute("SELECT Dates_Enregistr FROM Phono WHERE Dates_Enregistr NOT REGEXP '^ *$'")
-
-        print "WARNING: this migrator is a work in progress"
-        while True:
-            row = self.src_cursor.fetchone()
-            if not row:
-                break
-
-            recognized = False
-            if re.match('^ *(nn?|=|id|idem|\?+|[-_]?1|[0 ]*) *$', row[0], re.IGNORECASE):
-                self.stats['unsignificant'] += 1
-                recognized = True
-            else:
-                for p in patterns:
-                    if re.match(p, row[0], re.IGNORECASE):
-                        self.stats['matched'] += 1
-                        recognized = True
-                        break
-
-            if not recognized:
-                #print '|%s|' % row[0]
-                self.stats['unparsed'] += 1
-
-
-
-
-
-- 
2.39.5
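
For reference, the parse_year() helper moved into core.py's DataMigrator pivots two-digit years on the current year: values at or below the current year's last two digits go to the 2000s, the rest to the 1900s, and four-digit values are only kept between 1900 and today. The following is a minimal standalone sketch of that logic, re-implemented outside the migrator classes; the function name and the sample values are illustrative only, not part of the patch.

# Standalone sketch of the year normalization performed by parse_year() in core.py.
from datetime import date

def normalize_year(year):
    current = date.today().year
    if 0 < year < 100:
        # Two-digit year: pivot on the last two digits of the current year.
        return year + 2000 if year <= current % 100 else year + 1900
    elif 1900 <= year <= current:
        return year
    return 0  # out of range, treated as "no usable year"

# As of 2009 (the year of this commit): 68 -> 1968, 5 -> 2005, 1875 -> 0.
print(normalize_year(68), normalize_year(5), normalize_year(1875))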
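
The ItemsDateConverter patterns in dates.py rely on named groups (day, month, name, year, plus their until_* counterparts) so that match.groupdict() can be handed straight to parse_date_data(), which widens whatever is missing: no month means the whole year, no day means the whole month. Below is a deliberately cut-down sketch of that flow with a single hypothetical pattern and a simplified fill-in step; MONTH_JUILLET, PATTERN and to_range() are illustrative names, not code from the patch, and the crude "+ 1900" pivot stands in for the real parse_year() rule.

# Simplified sketch: one date pattern with named groups feeding groupdict(),
# then widening the result the way parse_date_data() does when parts are missing.
import calendar
import re
from datetime import date

# Hypothetical cut-down pattern: "<day> <French month name> <year>", e.g. "3 juil. 68".
# The real converter builds an alternation over all twelve month names and 17 patterns.
MONTH_JUILLET = r'juil(?:let|\.?)'
PATTERN = (r'^ *(?P<day>[0-9]{1,2}) *(?P<name>' + MONTH_JUILLET +
           r') *(?P<year>[0-9]{2,4}) *$')

def to_range(data):
    """Map a groupdict() to a (from_date, to_date) pair, widening missing parts."""
    year = int(data.get('year') or 0)
    if year < 100:                      # crude two-digit pivot, for the sketch only
        year += 1900
    month = 7 if data.get('name') else int(data.get('month') or 0)
    day = int(data.get('day') or 0)
    if not month:                       # year only -> whole year
        return date(year, 1, 1), date(year, 12, 31)
    if not day:                         # month only -> whole month
        return date(year, month, 1), date(year, month, calendar.monthrange(year, month)[1])
    return date(year, month, day), date(year, month, day)

m = re.match(PATTERN, '3 juil. 68', re.IGNORECASE)
print(to_range(m.groupdict()))          # (datetime.date(1968, 7, 3), datetime.date(1968, 7, 3))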
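
last_day_of_month() in dates.py hand-rolls the month lengths and uses the simplified leap rule year % 4 == 0, which holds for every year these recordings can plausibly carry except 1900 itself. For comparison, calendar.monthrange() from the standard library applies the full Gregorian rule; the snippet below is a standalone side-by-side check, not part of the patch.

# Standalone comparison of the hand-rolled month-length logic against the
# standard library; calendar.monthrange() already handles century leap years.
import calendar

def last_day_of_month(month, year):
    # Same logic as the helper in dates.py (simplified leap rule: year % 4).
    if month == 2:
        return 29 if year % 4 == 0 else 28
    if month < 8:
        return 31 if month % 2 == 1 else 30
    return 31 if month % 2 == 0 else 30

for year in (1968, 1900, 2000):
    print(year, last_day_of_month(2, year), calendar.monthrange(year, 2)[1])
# 1968: 29 29 -- 1900: 29 28 (the one divergence) -- 2000: 29 29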