--- /dev/null
+import MySQLdb
+import _mysql_exceptions
+import re
+from sys import stdout
+
+# mysql> describe coll;
+# +--------+--------------+------+-----+---------+-------+
+# | Field  | Type         | Null | Key | Default | Extra |
+# +--------+--------------+------+-----+---------+-------+
+# | cote   | varchar(255) | YES  | UNI | NULL    |       |
+# | format | varchar(255) | YES  |     | NULL    |       |
+# | regex  | varchar(255) | YES  |     | NULL    |       |
+# | new_id | varchar(64)  | YES  | UNI | NULL    |       |
+# +--------+--------------+------+-----+---------+-------+
+
+
+# Known layouts of the legacy `cote` identifiers, tried in this order.
+#
+# Each entry has two keys:
+#   format -- label written to coll.format for every cote matching the entry
+#   regex  -- pattern handed both to MySQL's REGEXP and to Python's re module
+#
+# The conversion code reads the capturing groups by position: group 1 is the
+# prefix (BM / CY / DI), group 2 the year, group 3 the serial number and
+# group 4, when present, the physical item number.  Anything matched outside
+# a group (e.g. a trailing "/pp") is recognised but not carried over.
+collection_patterns = [
+    dict(format='BM.aaa.nnn.mmm',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})$'),
+    dict(format='BM.aaaa.nnn.mmm/pp',
+         regex=r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'),
+    dict(format='BM.aaaa.nnn.mmm',
+         regex=r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})$'),
+    dict(format='BM.aaaa.nnn.mmm/ppp',
+         regex=r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{3}$'),
+    dict(format='BM.aaaa.nnn.mm/pp',
+         regex=r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{2})/[0-9]{2}$'),
+    dict(format='BM.aaaa.nnn',
+         regex=r'^(BM)\.([0-9]{4})\.([0-9]{3})$'),
+    dict(format='BM.aaa.nnn.mmm/pp',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'),
+    dict(format='BM.aaa.nnn FANTOME',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3}) FANTOME$'),
+    dict(format='BM.aaa.nnn',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3})$'),
+    dict(format='BM.aaa.nnnBISoo/pp',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3})BIS([0-9]{2})/[0-9]{2}$'),
+    dict(format='BM.aaa.nnn.mmm.ppp',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})\.[0-9]{3}$'),
+    dict(format='BM.aaa.nnn.mmm/ppp',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{3}$'),
+    dict(format='BM.aaa.nnn/pp',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'),
+    dict(format='BM.aaa.nnn-BIS.ooo/pp',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3})-BIS\.([0-9]{3})/[0-9]{2}$'),
+    dict(format='BM.aaaa.nnn.mmm/NN',
+         regex=r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/NN$'),
+    dict(format='BM.aaa.nnn.mmm/pp-DEPOT',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}-DEPOT$'),
+    dict(format='BM.aaa.nnn.mmm-o>p',
+         regex=r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})-[0-9]>[0-9]$'),
+    dict(format='CY.aaaa.nnn',
+         regex=r'^(CY)\.([0-9]{4})\.([0-9]{3})$'),
+    dict(format='DI.aaaa.nnn.mmm',
+         regex=r'^(DI)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})$'),
+    dict(format='DI.aaaa.nnn.mmm/pp',
+         regex=r'^(DI)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'),
+    dict(format='DI.aaa.nnn.mmm',
+         regex=r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})$'),
+    dict(format='DI.aaa.nnn.mmm/pp',
+         regex=r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'),
+    dict(format='DI.aaa.nnn.mmm-o/p',
+         regex=r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})-[0-9]/[0-9]$'),
+    dict(format='FANTOME 2*',
+         regex=r'FANTOME 2\*$'),
+]
+
+# Connection to the local MySQL database that holds the `coll` table.
+db = MySQLdb.connect(user='root', db='test')
+
+# rcursor runs the SELECTs, wcursor the UPDATEs.
+rcursor = db.cursor()
+wcursor = db.cursor()
+
+
+def _new_id_from_match(match):
+    """Build the new identifier from a successful pattern match.
+
+    Group 1 is the prefix ('DI' selects the CNRSMH_E series, anything else
+    CNRSMH_I), group 2 the year, group 3 the serial number and group 4, when
+    captured, the physical item number (defaults to 1).
+
+    Returns the identifier string, or None when the pattern captured no year
+    or no serial.
+    """
+    # lastindex is None when the pattern has no capturing group at all.
+    captured = match.lastindex or 0
+    if captured < 3:
+        return None
+
+    series = 'CNRSMH_E' if match.group(1) == 'DI' else 'CNRSMH_I'
+    year = int(match.group(2))
+    serial = int(match.group(3))
+    physical = int(match.group(4)) if captured >= 4 else 1
+
+    # Expand abbreviated years: 0-99 -> 20xx, 100-999 -> 1xxx (977 -> 1977).
+    # NOTE(review): the previous code added 1900 to three-digit years, which
+    # turned 977 into 2877; +1000 is assumed to be the intent -- confirm
+    # against the data.
+    if year < 100:
+        year += 2000
+    elif year < 1000:
+        year += 1000
+
+    return '_'.join([series, str(year), str(serial).zfill(3),
+                     str(physical).zfill(3)])
+
+
+# Reset the results of any previous run.
+wcursor.execute("UPDATE coll SET format='', new_id = NULL")
+
+# The table size is only used for progress output and is not changed by the
+# UPDATEs below, so it is queried once instead of once per pattern.
+rcursor.execute("SELECT COUNT(*) FROM coll")
+count = rcursor.fetchone()[0]
+
+# NOTE(review): nothing in this script calls db.commit(); on a transactional
+# storage engine the updates are discarded when the connection closes --
+# confirm whether that is intended.
+
+nrow = 0
+for pattern in collection_patterns:
+    stdout.write('* format: ' + pattern['format'] + '\n')
+
+    # Tag every matching row with the human-readable format label.
+    wcursor.execute("UPDATE coll SET format=%s WHERE cote REGEXP %s",
+                    (pattern['format'], pattern['regex']))
+
+    regex = re.compile(pattern['regex'])
+    rcursor.execute("SELECT cote FROM coll WHERE cote REGEXP %s", (pattern['regex'],))
+
+    # Read the whole result set before issuing any UPDATE on the connection.
+    for (cote,) in rcursor.fetchall():
+        if nrow % 200 == 0:
+            stdout.write(" row " + str(nrow) + "/" + str(count) + '\n')
+        nrow += 1
+
+        # MySQL's REGEXP finds a match anywhere in the string, so use
+        # search() rather than match().  The result can still be None, e.g.
+        # when MySQL matched case-insensitively; report it instead of
+        # crashing on None.
+        match = regex.search(cote)
+        if match is None:
+            stdout.write(' not matched by Python regex: ' + cote + '\n')
+            continue
+
+        new_id = _new_id_from_match(match)
+        if new_id is None:
+            stdout.write(' missing year or serial: ' + cote + '\n')
+            continue
+
+        try:
+            wcursor.execute("UPDATE coll SET new_id = %s WHERE cote = %s",
+                            (new_id, cote))
+        except _mysql_exceptions.IntegrityError as exc:
+            # 1062 is MySQL's duplicate-key error: another cote already
+            # produced this new_id.  Anything else is unexpected.
+            if exc.args[0] != 1062:
+                raise
+            stdout.write(' duplicate entry: ' + cote + ' -> ' + new_id + '\n')
+
+# Final report, part 1: how many rows ended up under each format label.
+rcursor.execute("SELECT format, COUNT(*) FROM coll GROUP BY format")
+stdout.write("\nFORMAT STATS:\n")
+# iter(callable, sentinel) keeps calling fetchone() until it returns None.
+for format_name, total in iter(rcursor.fetchone, None):
+    stdout.write(format_name + ": " + str(total) + '\n')
+
+# Final report, part 2: every cote that did not receive a new_id.
+rcursor.execute("SELECT cote FROM coll WHERE new_id IS NULL")
+stdout.write("\nUNCONVERTED IDs:\n")
+for (cote,) in iter(rcursor.fetchone, None):
+    stdout.write(cote + '\n')
+
+
+
+
+
+
+