"""Analyse the legacy collection identifiers stored in ``coll.cote``.

For every known identifier format the script tags the matching rows of the
``coll`` table with the format name and derives a normalised ``new_id`` of
the form ``CNRSMH_<E|I>_<year>_<serial>_<physical>``.  It then prints
per-format statistics and the identifiers that could not be converted.

Provenance: added by olivier on Fri, 2 May 2008 17:25:20 +0000 as
docref/tools/analyse_collection_ids.py ("add collection IDs analysis
script"), telemeta-data commit 9cfa073fce758123b82e69422fe13c7dc25f5d01,
git-svn-id: http://svn.parisson.org/svn/crem@13
3bf09e05-f825-4182-b9bc-eedd7160adf0.
"""

import re
from sys import stdout

# mysql> describe coll;
# +--------+--------------+------+-----+---------+-------+
# | Field  | Type         | Null | Key | Default | Extra |
# +--------+--------------+------+-----+---------+-------+
# | cote   | varchar(255) | YES  | UNI | NULL    |       |
# | format | varchar(255) | YES  |     | NULL    |       |
# | regex  | varchar(255) | YES  |     | NULL    |       |
# | new_id | varchar(64)  | YES  | UNI | NULL    |       |
# +--------+--------------+------+-----+---------+-------+

# MySQL error number raised when a UNIQUE key (here: new_id) is violated.
DUPLICATE_ENTRY = 1062

# Known identifier layouts.  'format' is the human-readable label written to
# coll.format; 'regex' is used both by MySQL (REGEXP) and by Python (re).
# Capture groups, when present, are: 1 = prefix, 2 = year, 3 = serial,
# 4 = physical item number.
collection_patterns = [
    {'format': 'BM.aaa.nnn.mmm', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})$'},
    {'format': 'BM.aaaa.nnn.mmm/pp', 'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
    {'format': 'BM.aaaa.nnn.mmm', 'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})$'},
    {'format': 'BM.aaaa.nnn.mmm/ppp', 'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{3}$'},
    {'format': 'BM.aaaa.nnn.mm/pp', 'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{2})/[0-9]{2}$'},
    {'format': 'BM.aaaa.nnn', 'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})$'},
    {'format': 'BM.aaa.nnn.mmm/pp', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
    {'format': 'BM.aaa.nnn FANTOME', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3}) FANTOME$'},
    {'format': 'BM.aaa.nnn', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})$'},
    {'format': 'BM.aaa.nnnBISoo/pp', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})BIS([0-9]{2})/[0-9]{2}$'},
    {'format': 'BM.aaa.nnn.mmm.ppp', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})\.[0-9]{3}$'},
    {'format': 'BM.aaa.nnn.mmm/ppp', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{3}$'},
    {'format': 'BM.aaa.nnn/pp', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
    {'format': 'BM.aaa.nnn-BIS.ooo/pp', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})-BIS\.([0-9]{3})/[0-9]{2}$'},
    {'format': 'BM.aaaa.nnn.mmm/NN', 'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/NN$'},
    {'format': 'BM.aaa.nnn.mmm/pp-DEPOT', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}-DEPOT$'},
    {'format': 'BM.aaa.nnn.mmm-o>p', 'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})-[0-9]>[0-9]$'},
    {'format': 'CY.aaaa.nnn', 'regex': r'^(CY)\.([0-9]{4})\.([0-9]{3})$'},
    {'format': 'DI.aaaa.nnn.mmm', 'regex': r'^(DI)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})$'},
    {'format': 'DI.aaaa.nnn.mmm/pp', 'regex': r'^(DI)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
    {'format': 'DI.aaa.nnn.mmm', 'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})$'},
    {'format': 'DI.aaa.nnn.mmm/pp', 'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
    {'format': 'DI.aaa.nnn.mmm-o/p', 'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})-[0-9]/[0-9]$'},
    {'format': 'FANTOME 2*', 'regex': r'FANTOME 2\*$'},
]


def derive_new_id(regex, cote):
    """Build the normalised identifier for *cote* using *regex*.

    Args:
        regex: one of the ``'regex'`` values of ``collection_patterns``.
        cote: the legacy identifier read from ``coll.cote``.

    Returns:
        The new identifier, e.g. ``'CNRSMH_I_1975_012_003'``, or ``None``
        when *cote* does not match *regex* in Python or when the pattern
        does not capture at least a prefix, a year and a serial.
    """
    # re.search, not re.match: MySQL's REGEXP is an unanchored search, so a
    # pattern without '^' (e.g. 'FANTOME 2\*$') selects rows that re.match
    # would reject.  MySQL's regex dialect and collation rules also differ
    # from Python's, hence the explicit None guard.
    match = re.search(regex, cote)
    if match is None:
        return None

    groups = match.groups()
    if len(groups) < 3:
        # No year and/or serial captured: nothing to build an id from.
        return None

    published = (groups[0] == 'DI')
    year = int(groups[1])
    serial = int(groups[2])
    # Patterns without a fourth group get the default physical number 1.
    physical = int(groups[3]) if len(groups) >= 4 else 1

    # Three-digit years are the four-digit year without its thousands digit:
    # 0xx -> 20xx, otherwise 1xxx.  (The previous '+ 1900' offset turned
    # e.g. 974 into 2874.)  NOTE(review): confirm against the source data.
    if year < 100:
        year += 2000
    elif year < 1000:
        year += 1000

    return '_'.join([
        'CNRSMH_E' if published else 'CNRSMH_I',
        str(year),
        '%03d' % serial,
        '%03d' % physical,
    ])


def main():
    """Tag every ``coll`` row with its format and new id, then print stats."""
    # Imported here so that the pure helpers above remain importable on
    # machines where the MySQL driver is not installed.
    import MySQLdb

    db = MySQLdb.connect(user='root', db='test')
    try:
        rcursor = db.cursor()
        wcursor = db.cursor()

        # Start from a clean slate so that the script can be re-run.
        wcursor.execute("UPDATE coll SET format='', new_id = NULL")

        # The table size does not change below (UPDATEs only), so it is
        # queried once rather than once per pattern.
        rcursor.execute("SELECT COUNT(*) FROM coll")
        count = rcursor.fetchone()[0]

        nrow = 0
        for pattern in collection_patterns:
            stdout.write('* format: ' + pattern['format'] + '\n')
            wcursor.execute("UPDATE coll SET format=%s WHERE cote REGEXP %s",
                            (pattern['format'], pattern['regex']))
            rcursor.execute("SELECT cote FROM coll WHERE cote REGEXP %s",
                            (pattern['regex'],))
            for (cote,) in rcursor.fetchall():
                if nrow % 200 == 0:
                    stdout.write(" row " + str(nrow) + "/" + str(count) + '\n')
                nrow += 1

                new_id = derive_new_id(pattern['regex'], cote)
                if new_id is None:
                    stdout.write(' missing year or serial: ' + cote + '\n')
                    continue

                try:
                    wcursor.execute(
                        "UPDATE coll SET new_id = %s WHERE cote = %s",
                        (new_id, cote))
                except MySQLdb.IntegrityError as exc:
                    # Two legacy ids mapping to the same new id is reported,
                    # not fatal; any other integrity error is.
                    if exc.args[0] != DUPLICATE_ENTRY:
                        raise
                    stdout.write(' duplicate entry: ' + cote + ' -> '
                                 + new_id + '\n')

        rcursor.execute("SELECT format, COUNT(*) FROM coll GROUP BY format")
        stdout.write("\nFORMAT STATS:\n")
        for fmt, total in rcursor.fetchall():
            stdout.write("%s: %s\n" % (fmt, total))

        rcursor.execute("SELECT cote FROM coll WHERE new_id IS NULL")
        stdout.write("\nUNCONVERTED IDs:\n")
        for (cote,) in rcursor.fetchall():
            stdout.write("%s\n" % (cote,))

        # MySQLdb does not autocommit: without this the updates are rolled
        # back on transactional storage engines when the connection closes.
        db.commit()
    finally:
        db.close()


if __name__ == '__main__':
    main()