git.parisson.com Git - telemeta-data.git/commitdiff
add collection IDs analysis script
author olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Fri, 2 May 2008 17:25:20 +0000 (17:25 +0000)
committer olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Fri, 2 May 2008 17:25:20 +0000 (17:25 +0000)
git-svn-id: http://svn.parisson.org/svn/crem@13 3bf09e05-f825-4182-b9bc-eedd7160adf0

docref/tools/analyse_collection_ids.py [new file with mode: 0644]

diff --git a/docref/tools/analyse_collection_ids.py b/docref/tools/analyse_collection_ids.py
new file mode 100644 (file)
index 0000000..9a08efc
--- /dev/null
@@ -0,0 +1,159 @@
+import MySQLdb
+import _mysql_exceptions
+import re
+from sys import stdout
+
+# Analyse the legacy collection identifiers ("cote") stored in the `coll`
+# table: tag each row with the name of the format its cote matches, then
+# derive a normalised identifier (new_id) for every cote that carries a
+# year and a serial number. Progress, duplicates and leftovers are written
+# to stdout.
+#
+# mysql> describe coll;
+# +--------+--------------+------+-----+---------+-------+
+# | Field  | Type         | Null | Key | Default | Extra |
+# +--------+--------------+------+-----+---------+-------+
+# | cote   | varchar(255) | YES  | UNI | NULL    |       |
+# | format | varchar(255) | YES  |     | NULL    |       |
+# | regex  | varchar(255) | YES  |     | NULL    |       |
+# | new_id | varchar(64)  | YES  | UNI | NULL    |       |
+# +--------+--------------+------+-----+---------+-------+
+
+
+# Known cote layouts. Where capture groups are present they are read as
+# (prefix, year, serial[, physical]); text outside the groups is not used
+# when building the new identifier.
+collection_patterns = [
+    { 'format': 'BM.aaa.nnn.mmm',           'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})$'},
+    { 'format': 'BM.aaaa.nnn.mmm/pp',       'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
+    { 'format': 'BM.aaaa.nnn.mmm',          'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})$'},
+    { 'format': 'BM.aaaa.nnn.mmm/ppp',      'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{3}$'},
+    { 'format': 'BM.aaaa.nnn.mm/pp',        'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{2})/[0-9]{2}$'},
+    { 'format': 'BM.aaaa.nnn',              'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})$'},
+    { 'format': 'BM.aaa.nnn.mmm/pp',        'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
+    { 'format': 'BM.aaa.nnn FANTOME',       'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3}) FANTOME$'},
+    { 'format': 'BM.aaa.nnn',               'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})$'},
+    { 'format': 'BM.aaa.nnnBISoo/pp',       'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})BIS([0-9]{2})/[0-9]{2}$'},
+    { 'format': 'BM.aaa.nnn.mmm.ppp',       'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})\.[0-9]{3}$'},
+    { 'format': 'BM.aaa.nnn.mmm/ppp',       'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{3}$'},
+    { 'format': 'BM.aaa.nnn/pp',            'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
+    { 'format': 'BM.aaa.nnn-BIS.ooo/pp',    'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})-BIS\.([0-9]{3})/[0-9]{2}$'},
+    { 'format': 'BM.aaaa.nnn.mmm/NN',       'regex': r'^(BM)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/NN$'},
+    { 'format': 'BM.aaa.nnn.mmm/pp-DEPOT',  'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}-DEPOT$'},
+    { 'format': 'BM.aaa.nnn.mmm-o>p',       'regex': r'^(BM)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})-[0-9]>[0-9]$'},
+    { 'format': 'CY.aaaa.nnn',              'regex': r'^(CY)\.([0-9]{4})\.([0-9]{3})$'},
+    { 'format': 'DI.aaaa.nnn.mmm',          'regex': r'^(DI)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})$'},
+    { 'format': 'DI.aaaa.nnn.mmm/pp',       'regex': r'^(DI)\.([0-9]{4})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
+    { 'format': 'DI.aaa.nnn.mmm',           'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})$'},
+    { 'format': 'DI.aaa.nnn.mmm/pp',        'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})/[0-9]{2}$'},
+    { 'format': 'DI.aaa.nnn.mmm-o/p',       'regex': r'^(DI)\.([0-9]{3})\.([0-9]{3})\.([0-9]{3})-[0-9]/[0-9]$'},
+    { 'format': 'FANTOME 2*',               'regex': r'FANTOME 2\*$'},
+]
+
+# MySQL error number for a duplicate value in a unique key (ER_DUP_ENTRY).
+DUPLICATE_ENTRY = 1062
+
+
+def build_new_id(match):
+    """Return the new identifier for a matched cote, or None.
+
+    `match` is the result of applying one of the collection_patterns
+    regexes to a cote. None is returned when the pattern captures no year
+    and serial (fewer than three groups), since no identifier can be built.
+    """
+    groups = match.groups()
+    if len(groups) < 3:
+        return None
+
+    year = int(groups[1])
+    serial = int(groups[2])
+    # Cotes without a physical-item number get item 1 ('001').
+    physical = 1
+    if len(groups) >= 4:
+        physical = int(groups[3])
+
+    # Abbreviated years: values below 100 are taken as 20xx, values from
+    # 100 to 999 as 1xxx (e.g. 975 -> 1975). The previous offset of 1900
+    # turned 975 into 2875.
+    if year < 100:
+        year += 2000
+    elif year < 1000:
+        year += 1000
+
+    # 'DI' cotes are flagged as published and get the _E prefix; every
+    # other prefix gets _I.
+    if groups[0] == 'DI':
+        prefix = 'CNRSMH_E'
+    else:
+        prefix = 'CNRSMH_I'
+
+    return '_'.join([prefix, str(year), str(serial).zfill(3),
+                     str(physical).zfill(3)])
+
+
+db = MySQLdb.connect(user='root', db='test')
+
+rcursor = db.cursor()
+wcursor = db.cursor()
+
+# Reset the results of any previous run.
+wcursor.execute("UPDATE coll SET format='', new_id = NULL")
+
+# The table size does not change during the run; fetch it once for the
+# progress output.
+rcursor.execute("SELECT COUNT(*) FROM coll")
+count = rcursor.fetchone()[0]
+
+nrow = 0
+for pattern in collection_patterns:
+    stdout.write('* format: ' + pattern['format'] + '\n')
+    wcursor.execute("UPDATE coll SET format=%s WHERE cote REGEXP %s",
+        (pattern['format'], pattern['regex']))
+    rcursor.execute("SELECT cote FROM coll WHERE cote REGEXP %s", (pattern['regex'],))
+    for (cote,) in rcursor.fetchall():
+        if nrow % 200 == 0:
+            stdout.write("  row " + str(nrow) + "/" + str(count) + '\n')
+        nrow += 1
+
+        # MySQL's REGEXP searches anywhere in the string, so use
+        # re.search (not re.match) to stay in step with unanchored
+        # patterns such as 'FANTOME 2*'. The Python match can still fail
+        # (REGEXP is not case-sensitive on non-binary strings), hence the guard.
+        match = re.search(pattern['regex'], cote)
+        new_id = None
+        if match is not None:
+            new_id = build_new_id(match)
+
+        if new_id is None:
+            stdout.write('    missing year or serial: ' + cote + '\n')
+            continue
+
+        try:
+            wcursor.execute("UPDATE coll SET new_id = %s WHERE cote = %s", (new_id, cote))
+        except _mysql_exceptions.IntegrityError as exc:
+            # new_id is a unique key: report collisions, re-raise the rest.
+            if exc.args[0] != DUPLICATE_ENTRY:
+                raise
+            stdout.write('    duplicate entry: ' + cote + ' -> ' + new_id + '\n')
+
+# MySQLdb does not autocommit; without this the updates are rolled back on
+# transactional tables when the connection closes.
+db.commit()
+
+rcursor.execute("SELECT format, COUNT(*) FROM coll GROUP BY format")
+
+stdout.write("\nFORMAT STATS:\n")
+for format_name, total in rcursor.fetchall():
+    stdout.write(format_name + ": " + str(total) + '\n')
+
+rcursor.execute("SELECT cote FROM coll WHERE new_id IS NULL")
+stdout.write("\nUNCONVERTED IDs:\n")
+for (cote,) in rcursor.fetchall():
+    # cote is nullable; a NULL would otherwise break the concatenation.
+    if cote is None:
+        cote = 'NULL'
+    stdout.write(cote + '\n')
+
+rcursor.close()
+wcursor.close()
+db.close()