turn ethnic_groups into a simple enumeration, ensure ethnic group name uniqueness

author olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>

Mon, 15 Feb 2010 23:01:33 +0000 (23:01 +0000)

committer olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>

Mon, 15 Feb 2010 23:01:33 +0000 (23:01 +0000)
author olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Mon, 15 Feb 2010 23:01:33 +0000 (23:01 +0000)
committer olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Mon, 15 Feb 2010 23:01:33 +0000 (23:01 +0000)
diff --git a/trunk/docref/crem.sql b/trunk/docref/crem.sql

index 4c4fcb13905daee5a88080f612ad3ea481938c3a..c07478d05841ed5fda9bc8a9d1b9872eec7fc0e7 100644 (file)
--- a/trunk/docref/crem.sql
+++ b/trunk/docref/crem.sql
@@ -170,13 +170,14 @@ CREATE TABLE location_relations (
  
  CREATE TABLE ethnic_groups (
      id      INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, 
-    name    VARCHAR(250) NOT NULL
+    value   VARCHAR(250) NOT NULL UNIQUE
  ) CHARACTER SET='utf8' ENGINE=InnoDB;
  
  CREATE TABLE ethnic_group_aliases (
      ethnic_group_id INTEGER NOT NULL, 
-    name            VARCHAR(250) NOT NULL,
+    value           VARCHAR(250) NOT NULL,
  
+    UNIQUE(ethnic_group_id, value),
      FOREIGN KEY(ethnic_group_id) REFERENCES ethnic_groups (id) ON DELETE CASCADE
  ) CHARACTER SET='utf8' ENGINE=InnoDB;
  
diff --git a/trunk/docref/docref.odt b/trunk/docref/docref.odt

index 1373847645371a11e83f5080851315798d7ef347..76207897ffdd7c0bca2dc84104f4db422a3f0619 100644 (file)

Binary files a/trunk/docref/docref.odt and b/trunk/docref/docref.odt differ
diff --git a/trunk/import/migration/tasks/core.py b/trunk/import/migration/tasks/core.py

index c9acafa7d7013e02b6d75b912123ec0c4f3246cd..fcf8693cfa9b9be5eae60ef03f7d07ff6ff9e54f 100644 (file)
--- a/trunk/import/migration/tasks/core.py
+++ b/trunk/import/migration/tasks/core.py
@@ -39,6 +39,8 @@ import time
  import warnings
  import _mysql_exceptions
  import subprocess
+from unaccent import unaccent_icmp
+
  
  class DataMigrationTask(Component):
      
@@ -122,14 +124,18 @@ class GroupedItemsManager(object):
  
      def __init__(self):
          self.groups = {}
-    
+   
+    def find_group(self, group):
+        for g in self.groups:
+            if not unaccent_icmp(g, group):
+                return g
+        return None
+
      def append_group(self, group):
          group = group.strip()
          if not len(group):
              return
-        try:
-            self.groups[group]
-        except KeyError:
+        if not self.find_group(group):
              self.groups[group] = []
  
      def append_item(self, group, item, detect_group=False):
@@ -141,19 +147,15 @@ class GroupedItemsManager(object):
          g = None
          
          if detect_group:
-            try:
-                self.groups[group]
-                g = group
+            g = self.find_group(group)
+            if g:
                  i = item
-            except KeyError:
-                try:
-                    self.groups[item]
-                    g = item
+            else:
+                g = self.find_group(item)
+                if g:
                      i = group
-                except KeyError:
-                    pass
          else:
-            g = group
+            g = self.find_group(group)
              i = item
  
          if g:
diff --git a/trunk/import/migration/tasks/ethnic.py b/trunk/import/migration/tasks/ethnic.py

index 01f3d4002c7bca11170342756e6900db49715bbd..f0db6302e2b1cffb80bdcf0e011aa913b380da6f 100644 (file)
--- a/trunk/import/migration/tasks/ethnic.py
+++ b/trunk/import/migration/tasks/ethnic.py
@@ -72,11 +72,13 @@ class EthnicGroupsMigrator(DataMigrator):
              self.data.append_item(row[0], row[1], detect_group=True)
  
      def insert(self):
+        self.target("DELETE FROM ethnic_groups")
+        self.target("DELETE FROM ethnic_group_aliases")
          for group in self.data.groups:
-            self.target_cursor.execute("INSERT INTO ethnic_groups (name) VALUES(%s)", (group,))
+            self.target_cursor.execute("INSERT INTO ethnic_groups (value) VALUES(%s)", (group,))
              id = self.target_db.insert_id()
              for alias in self.data.groups[group]:
-                self.target_cursor.execute("INSERT INTO ethnic_group_aliases (ethnic_group_id, name) "+
+                self.target_cursor.execute("INSERT INTO ethnic_group_aliases (ethnic_group_id, value) "+
                                             "VALUES(%s, %s)", (id, alias))
  
      def process(self):
diff --git a/trunk/import/migration/tasks/geoethno.py b/trunk/import/migration/tasks/geoethno.py

index 5abc8ca5c33939fa92947f61e2f551f3cf1ece9e..5b26a61b0fd74e37c842802251781296f73d755f 100644 (file)
--- a/trunk/import/migration/tasks/geoethno.py
+++ b/trunk/import/migration/tasks/geoethno.py
@@ -216,7 +216,7 @@ class GeoEthnoImporter(DataMigrator):
          self.end()
  
  class GeoEthnoAncestryBuilder(DataMigrator):
-    """Update indirect location ancestry relations"""
+    """Build indirect location ancestry relations"""
  
      implements(IDataMigrator)
  
diff --git a/trunk/import/migration/tasks/items.py b/trunk/import/migration/tasks/items.py

index dc6a751aa7843d93646b826b8f2e6ba61fc0f320..1b20870c686872370ec97ef3c183f9170d2b17e4 100644 (file)
--- a/trunk/import/migration/tasks/items.py
+++ b/trunk/import/migration/tasks/items.py
@@ -111,7 +111,7 @@ class ItemsEnumMapper(EnumMapper):
      implements(IDataMigrator)
  
      map = [
-        ('Ethnie_GrSocial',  'ethnic_group:name'),
+        ('Ethnie_GrSocial',  'ethnic_group'),
          ('Form_Genr_Style',  'vernacular_style'),
          ('FormStyl generi',  'generic_style')
      ]
diff --git a/trunk/import/migration/tasks/unaccent.py b/trunk/import/migration/tasks/unaccent.py

new file mode 100644 (file)

index 0000000..7c5757d
--- /dev/null
+++ b/trunk/import/migration/tasks/unaccent.py
@@ -0,0 +1,71 @@
+# This file by Fredrik Lundh from:
+# http://effbot.org/zone/unicode-convert.htm
+# http://effbot.python-hosting.com/file/stuff/sandbox/text/unaccent.py
+
+# use a dynamically populated translation dictionary to remove accents
+# from a string
+
+import unicodedata, sys
+
+CHAR_REPLACEMENT = {
+    # latin-1 characters that don't have a unicode decomposition
+    0xc6: u"AE", # LATIN CAPITAL LETTER AE
+    0xd0: u"D",  # LATIN CAPITAL LETTER ETH
+    0xd8: u"OE", # LATIN CAPITAL LETTER O WITH STROKE
+    0xde: u"Th", # LATIN CAPITAL LETTER THORN
+    0xdf: u"ss", # LATIN SMALL LETTER SHARP S
+    0xe6: u"ae", # LATIN SMALL LETTER AE
+    0xf0: u"d",  # LATIN SMALL LETTER ETH
+    0xf8: u"oe", # LATIN SMALL LETTER O WITH STROKE
+    0xfe: u"th", # LATIN SMALL LETTER THORN
+    }
+
+##
+# Translation dictionary.  Translation entries are added to this
+# dictionary as needed.
+
+class UnaccentedMap(dict):
+
+    ##
+    # Maps a unicode character code (the key) to a replacement code
+    # (either a character code or a unicode string).
+
+    def mapchar(self, key):
+        ch = self.get(key)
+        if ch is not None:
+            return ch
+        de = unicodedata.decomposition(unichr(key))
+        if de:
+            try:
+                ch = int(de.split(None, 1)[0], 16)
+            except (IndexError, ValueError):
+                ch = key
+        else:
+            ch = CHAR_REPLACEMENT.get(key, key)
+        self[key] = ch
+        return ch
+
+    if sys.version >= "2.5":
+        # use __missing__ where available
+        __missing__ = mapchar
+    else:
+        # otherwise, use standard __getitem__ hook (this is slower,
+        # since it's called for each character)
+        __getitem__ = mapchar
+
+
+_map = UnaccentedMap()
+
+def unaccent(str):
+    return str.translate(_map)
+
+def unaccent_icmp(str1, str2):
+    str1 = unaccent(str1).lower()
+    str2 = unaccent(str2).lower()
+    if str1 > str2:
+        return 1
+
+    if str1 < str2:
+        return -1
+
+    return 0
author	olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
	Mon, 15 Feb 2010 23:01:33 +0000 (23:01 +0000)
committer	olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
	Mon, 15 Feb 2010 23:01:33 +0000 (23:01 +0000)
trunk/docref/crem.sql		patch | blob | history
trunk/docref/docref.odt		patch | blob | history
trunk/import/migration/tasks/core.py		patch | blob | history
trunk/import/migration/tasks/ethnic.py		patch | blob | history
trunk/import/migration/tasks/geoethno.py		patch | blob | history
trunk/import/migration/tasks/items.py		patch | blob | history
trunk/import/migration/tasks/unaccent.py	[new file with mode: 0644]	patch | blob