git.parisson.com Git - telemeta-data.git/commitdiff
turn ethnic_groups into a simple enumeration, ensure ethnic group name uniqueness
author olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Mon, 15 Feb 2010 23:01:33 +0000 (23:01 +0000)
committer olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Mon, 15 Feb 2010 23:01:33 +0000 (23:01 +0000)
git-svn-id: http://svn.parisson.org/svn/crem@158 3bf09e05-f825-4182-b9bc-eedd7160adf0

trunk/docref/crem.sql
trunk/docref/docref.odt
trunk/import/migration/tasks/core.py
trunk/import/migration/tasks/ethnic.py
trunk/import/migration/tasks/geoethno.py
trunk/import/migration/tasks/items.py
trunk/import/migration/tasks/unaccent.py [new file with mode: 0644]

index 4c4fcb13905daee5a88080f612ad3ea481938c3a..c07478d05841ed5fda9bc8a9d1b9872eec7fc0e7 100644 (file)
@@ -170,13 +170,14 @@ CREATE TABLE location_relations (
 
 CREATE TABLE ethnic_groups (
     id      INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, 
-    name    VARCHAR(250) NOT NULL
+    value   VARCHAR(250) NOT NULL UNIQUE
 ) CHARACTER SET='utf8' ENGINE=InnoDB;
 
 CREATE TABLE ethnic_group_aliases (
     ethnic_group_id INTEGER NOT NULL, 
-    name            VARCHAR(250) NOT NULL,
+    value           VARCHAR(250) NOT NULL,
 
+    UNIQUE(ethnic_group_id, value),
     FOREIGN KEY(ethnic_group_id) REFERENCES ethnic_groups (id) ON DELETE CASCADE
 ) CHARACTER SET='utf8' ENGINE=InnoDB;
 
index 1373847645371a11e83f5080851315798d7ef347..76207897ffdd7c0bca2dc84104f4db422a3f0619 100644 (file)
Binary files a/trunk/docref/docref.odt and b/trunk/docref/docref.odt differ
index c9acafa7d7013e02b6d75b912123ec0c4f3246cd..fcf8693cfa9b9be5eae60ef03f7d07ff6ff9e54f 100644 (file)
@@ -39,6 +39,8 @@ import time
 import warnings
 import _mysql_exceptions
 import subprocess
+from unaccent import unaccent_icmp
+
 
 class DataMigrationTask(Component):
     
@@ -122,14 +124,18 @@ class GroupedItemsManager(object):
 
     def __init__(self):
         self.groups = {}
-    
+   
+    def find_group(self, group):
+        for g in self.groups:
+            if not unaccent_icmp(g, group):
+                return g
+        return None
+
     def append_group(self, group):
         group = group.strip()
         if not len(group):
             return
-        try:
-            self.groups[group]
-        except KeyError:
+        if not self.find_group(group):
             self.groups[group] = []
 
     def append_item(self, group, item, detect_group=False):
@@ -141,19 +147,15 @@ class GroupedItemsManager(object):
         g = None
         
         if detect_group:
-            try:
-                self.groups[group]
-                g = group
+            g = self.find_group(group)
+            if g:
                 i = item
-            except KeyError:
-                try:
-                    self.groups[item]
-                    g = item
+            else:
+                g = self.find_group(item)
+                if g:
                     i = group
-                except KeyError:
-                    pass
         else:
-            g = group
+            g = self.find_group(group)
             i = item
 
         if g:
index 01f3d4002c7bca11170342756e6900db49715bbd..f0db6302e2b1cffb80bdcf0e011aa913b380da6f 100644 (file)
@@ -72,11 +72,13 @@ class EthnicGroupsMigrator(DataMigrator):
             self.data.append_item(row[0], row[1], detect_group=True)
 
     def insert(self):
+        self.target("DELETE FROM ethnic_groups")
+        self.target("DELETE FROM ethnic_group_aliases")
         for group in self.data.groups:
-            self.target_cursor.execute("INSERT INTO ethnic_groups (name) VALUES(%s)", (group,))
+            self.target_cursor.execute("INSERT INTO ethnic_groups (value) VALUES(%s)", (group,))
             id = self.target_db.insert_id()
             for alias in self.data.groups[group]:
-                self.target_cursor.execute("INSERT INTO ethnic_group_aliases (ethnic_group_id, name) "+
+                self.target_cursor.execute("INSERT INTO ethnic_group_aliases (ethnic_group_id, value) "+
                                            "VALUES(%s, %s)", (id, alias))
 
     def process(self):
index 5abc8ca5c33939fa92947f61e2f551f3cf1ece9e..5b26a61b0fd74e37c842802251781296f73d755f 100644 (file)
@@ -216,7 +216,7 @@ class GeoEthnoImporter(DataMigrator):
         self.end()
 
 class GeoEthnoAncestryBuilder(DataMigrator):
-    """Update indirect location ancestry relations"""
+    """Build indirect location ancestry relations"""
 
     implements(IDataMigrator)
 
index dc6a751aa7843d93646b826b8f2e6ba61fc0f320..1b20870c686872370ec97ef3c183f9170d2b17e4 100644 (file)
@@ -111,7 +111,7 @@ class ItemsEnumMapper(EnumMapper):
     implements(IDataMigrator)
 
     map = [
-        ('Ethnie_GrSocial',  'ethnic_group:name'),
+        ('Ethnie_GrSocial',  'ethnic_group'),
         ('Form_Genr_Style',  'vernacular_style'),
         ('FormStyl generi',  'generic_style')
     ]
diff --git a/trunk/import/migration/tasks/unaccent.py b/trunk/import/migration/tasks/unaccent.py
new file mode 100644 (file)
index 0000000..7c5757d
--- /dev/null
@@ -0,0 +1,71 @@
+# This file by Fredrik Lundh from:
+# http://effbot.org/zone/unicode-convert.htm
+# http://effbot.python-hosting.com/file/stuff/sandbox/text/unaccent.py
+
+# use a dynamically populated translation dictionary to remove accents
+# from a string
+
+import unicodedata, sys
+
+CHAR_REPLACEMENT = {
+    # latin-1 characters that don't have a unicode decomposition
+    0xc6: u"AE", # LATIN CAPITAL LETTER AE
+    0xd0: u"D",  # LATIN CAPITAL LETTER ETH
+    0xd8: u"OE", # LATIN CAPITAL LETTER O WITH STROKE
+    0xde: u"Th", # LATIN CAPITAL LETTER THORN
+    0xdf: u"ss", # LATIN SMALL LETTER SHARP S
+    0xe6: u"ae", # LATIN SMALL LETTER AE
+    0xf0: u"d",  # LATIN SMALL LETTER ETH
+    0xf8: u"oe", # LATIN SMALL LETTER O WITH STROKE
+    0xfe: u"th", # LATIN SMALL LETTER THORN
+    }
+
+##
+# Translation dictionary.  Translation entries are added to this
+# dictionary as needed.
+
+class UnaccentedMap(dict):
+
+    ##
+    # Maps a unicode character code (the key) to a replacement code
+    # (either a character code or a unicode string).
+
+    def mapchar(self, key):
+        ch = self.get(key)
+        if ch is not None:
+            return ch
+        de = unicodedata.decomposition(unichr(key))
+        if de:
+            try:
+                ch = int(de.split(None, 1)[0], 16)
+            except (IndexError, ValueError):
+                ch = key
+        else:
+            ch = CHAR_REPLACEMENT.get(key, key)
+        self[key] = ch
+        return ch
+
+    if sys.version >= "2.5":
+        # use __missing__ where available
+        __missing__ = mapchar
+    else:
+        # otherwise, use standard __getitem__ hook (this is slower,
+        # since it's called for each character)
+        __getitem__ = mapchar
+
+
+_map = UnaccentedMap()
+
+def unaccent(str):
+    return str.translate(_map)
+
+def unaccent_icmp(str1, str2):
+    str1 = unaccent(str1).lower()
+    str2 = unaccent(str2).lower()
+    if str1 > str2:
+        return 1
+
+    if str1 < str2:
+        return -1
+
+    return 0