From: olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Date: Mon, 15 Feb 2010 23:01:33 +0000 (+0000)
Subject: turn ethnic_groups into a simple enumeration, ensure ethnic group name uniqueness
X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=90d096eeb75ef1695b990a10833a0c72a2aa953f;p=telemeta-data.git

turn ethnic_groups into a simple enumeration, ensure ethnic group name uniqueness

git-svn-id: http://svn.parisson.org/svn/crem@158 3bf09e05-f825-4182-b9bc-eedd7160adf0
---

diff --git a/trunk/docref/crem.sql b/trunk/docref/crem.sql
index 4c4fcb1..c07478d 100644
--- a/trunk/docref/crem.sql
+++ b/trunk/docref/crem.sql
@@ -170,13 +170,14 @@ CREATE TABLE location_relations (
 
 CREATE TABLE ethnic_groups (
     id      INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, 
-    name    VARCHAR(250) NOT NULL
+    value   VARCHAR(250) NOT NULL UNIQUE
 ) CHARACTER SET='utf8' ENGINE=InnoDB;
 
 CREATE TABLE ethnic_group_aliases (
     ethnic_group_id INTEGER NOT NULL, 
-    name            VARCHAR(250) NOT NULL,
+    value           VARCHAR(250) NOT NULL,
 
+    UNIQUE(ethnic_group_id, value),
     FOREIGN KEY(ethnic_group_id) REFERENCES ethnic_groups (id) ON DELETE CASCADE
 ) CHARACTER SET='utf8' ENGINE=InnoDB;
 
diff --git a/trunk/docref/docref.odt b/trunk/docref/docref.odt
index 1373847..7620789 100644
Binary files a/trunk/docref/docref.odt and b/trunk/docref/docref.odt differ
diff --git a/trunk/import/migration/tasks/core.py b/trunk/import/migration/tasks/core.py
index c9acafa..fcf8693 100644
--- a/trunk/import/migration/tasks/core.py
+++ b/trunk/import/migration/tasks/core.py
@@ -39,6 +39,8 @@ import time
 import warnings
 import _mysql_exceptions
 import subprocess
+from unaccent import unaccent_icmp
+
 
 class DataMigrationTask(Component):
     
@@ -122,14 +124,18 @@ class GroupedItemsManager(object):
 
     def __init__(self):
         self.groups = {}
-    
+   
+    def find_group(self, group):
+        for g in self.groups:
+            if not unaccent_icmp(g, group):
+                return g
+        return None
+
     def append_group(self, group):
         group = group.strip()
         if not len(group):
             return
-        try:
-            self.groups[group]
-        except KeyError:
+        if not self.find_group(group):
             self.groups[group] = []
 
     def append_item(self, group, item, detect_group=False):
@@ -141,19 +147,15 @@ class GroupedItemsManager(object):
         g = None
         
         if detect_group:
-            try:
-                self.groups[group]
-                g = group
+            g = self.find_group(group)
+            if g:
                 i = item
-            except KeyError:
-                try:
-                    self.groups[item]
-                    g = item
+            else:
+                g = self.find_group(item)
+                if g:
                     i = group
-                except KeyError:
-                    pass
         else:
-            g = group
+            g = self.find_group(group)
             i = item
 
         if g:
diff --git a/trunk/import/migration/tasks/ethnic.py b/trunk/import/migration/tasks/ethnic.py
index 01f3d40..f0db630 100644
--- a/trunk/import/migration/tasks/ethnic.py
+++ b/trunk/import/migration/tasks/ethnic.py
@@ -72,11 +72,13 @@ class EthnicGroupsMigrator(DataMigrator):
             self.data.append_item(row[0], row[1], detect_group=True)
 
     def insert(self):
+        self.target("DELETE FROM ethnic_groups")
+        self.target("DELETE FROM ethnic_group_aliases")
         for group in self.data.groups:
-            self.target_cursor.execute("INSERT INTO ethnic_groups (name) VALUES(%s)", (group,))
+            self.target_cursor.execute("INSERT INTO ethnic_groups (value) VALUES(%s)", (group,))
             id = self.target_db.insert_id()
             for alias in self.data.groups[group]:
-                self.target_cursor.execute("INSERT INTO ethnic_group_aliases (ethnic_group_id, name) "+
+                self.target_cursor.execute("INSERT INTO ethnic_group_aliases (ethnic_group_id, value) "+
                                            "VALUES(%s, %s)", (id, alias))
 
     def process(self):
diff --git a/trunk/import/migration/tasks/geoethno.py b/trunk/import/migration/tasks/geoethno.py
index 5abc8ca..5b26a61 100644
--- a/trunk/import/migration/tasks/geoethno.py
+++ b/trunk/import/migration/tasks/geoethno.py
@@ -216,7 +216,7 @@ class GeoEthnoImporter(DataMigrator):
         self.end()
 
 class GeoEthnoAncestryBuilder(DataMigrator):
-    """Update indirect location ancestry relations"""
+    """Build indirect location ancestry relations"""
 
     implements(IDataMigrator)
 
diff --git a/trunk/import/migration/tasks/items.py b/trunk/import/migration/tasks/items.py
index dc6a751..1b20870 100644
--- a/trunk/import/migration/tasks/items.py
+++ b/trunk/import/migration/tasks/items.py
@@ -111,7 +111,7 @@ class ItemsEnumMapper(EnumMapper):
     implements(IDataMigrator)
 
     map = [
-        ('Ethnie_GrSocial',  'ethnic_group:name'),
+        ('Ethnie_GrSocial',  'ethnic_group'),
         ('Form_Genr_Style',  'vernacular_style'),
         ('FormStyl generi',  'generic_style')
     ]
diff --git a/trunk/import/migration/tasks/unaccent.py b/trunk/import/migration/tasks/unaccent.py
new file mode 100644
index 0000000..7c5757d
--- /dev/null
+++ b/trunk/import/migration/tasks/unaccent.py
@@ -0,0 +1,71 @@
+# This file by Fredrik Lundh from:
+# http://effbot.org/zone/unicode-convert.htm
+# http://effbot.python-hosting.com/file/stuff/sandbox/text/unaccent.py
+
+# use a dynamically populated translation dictionary to remove accents
+# from a string
+
+import unicodedata, sys
+
+CHAR_REPLACEMENT = {
+    # ASCII fallbacks for latin-1 characters lacking a unicode decomposition
+    0xc6: u"AE", # LATIN CAPITAL LETTER AE
+    0xd0: u"D",  # LATIN CAPITAL LETTER ETH
+    0xd8: u"OE", # LATIN CAPITAL LETTER O WITH STROKE
+    0xde: u"Th", # LATIN CAPITAL LETTER THORN
+    0xdf: u"ss", # LATIN SMALL LETTER SHARP S
+    0xe6: u"ae", # LATIN SMALL LETTER AE
+    0xf0: u"d",  # LATIN SMALL LETTER ETH
+    0xf8: u"oe", # LATIN SMALL LETTER O WITH STROKE
+    0xfe: u"th", # LATIN SMALL LETTER THORN
+    }
+
+##
+# Translation dictionary.  Translation entries are added to this
+# dictionary as needed.
+
+class UnaccentedMap(dict):
+
+    ##
+    # Maps a unicode character code (the key) to a replacement code
+    # (either a character code or a unicode string).
+
+    def mapchar(self, key):
+        ch = self.get(key)
+        if ch is not None:
+            return ch
+        de = unicodedata.decomposition(unichr(key))
+        if de:
+            try:
+                ch = int(de.split(None, 1)[0], 16)
+            except (IndexError, ValueError):
+                ch = key
+        else:
+            ch = CHAR_REPLACEMENT.get(key, key)
+        self[key] = ch
+        return ch
+
+    if sys.version >= "2.5":
+        # use __missing__ where available
+        __missing__ = mapchar
+    else:
+        # otherwise, use standard __getitem__ hook (this is slower,
+        # since it's called for each character)
+        __getitem__ = mapchar
+
+
+_map = UnaccentedMap()
+
+def unaccent(str):
+    return str.translate(_map)  # assumes unicode input -- TODO confirm
+
+def unaccent_icmp(str1, str2):
+    str1 = unaccent(str1).lower()
+    str2 = unaccent(str2).lower()
+    if str1 > str2:
+        return 1
+
+    if str1 < str2:
+        return -1
+
+    return 0