CREATE TABLE ethnic_groups (
id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY,
- name VARCHAR(250) NOT NULL
+ -- Group name; the UNIQUE constraint rejects a second row with the same value.
+ value VARCHAR(250) NOT NULL UNIQUE
) CHARACTER SET='utf8' ENGINE=InnoDB;
CREATE TABLE ethnic_group_aliases (
ethnic_group_id INTEGER NOT NULL,
- name VARCHAR(250) NOT NULL,
+ -- Alias text for the referenced group; a given alias may appear only once
+ -- per group, and rows are removed with their group (ON DELETE CASCADE).
+ value VARCHAR(250) NOT NULL,
+ UNIQUE(ethnic_group_id, value),
FOREIGN KEY(ethnic_group_id) REFERENCES ethnic_groups (id) ON DELETE CASCADE
) CHARACTER SET='utf8' ENGINE=InnoDB;
import warnings
import _mysql_exceptions
import subprocess
+from unaccent import unaccent_icmp
+
class DataMigrationTask(Component):
def __init__(self):
self.groups = {}
-
+
+    def find_group(self, group):
+        """Return the key of self.groups that matches `group`, or None.
+
+        Keys are compared with unaccent_icmp(); a result of 0 is a match.
+        The first matching key met while iterating self.groups is returned.
+        """
+        for known in self.groups:
+            if unaccent_icmp(known, group) != 0:
+                continue
+            return known
+        return None
+
def append_group(self, group):
+ # Register `group` (whitespace-stripped) in self.groups with an empty list.
+ # Empty names are ignored, and nothing is added when find_group() already
+ # reports a matching registered name.
group = group.strip()
if not len(group):
return
- try:
- self.groups[group]
- except KeyError:
+ if not self.find_group(group):
self.groups[group] = []
def append_item(self, group, item, detect_group=False):
g = None
if detect_group:
- try:
- self.groups[group]
- g = group
+ g = self.find_group(group)
+ if g:
i = item
- except KeyError:
- try:
- self.groups[item]
- g = item
+ else:
+ g = self.find_group(item)
+ if g:
i = group
- except KeyError:
- pass
else:
- g = group
+ g = self.find_group(group)
i = item
if g:
self.data.append_item(row[0], row[1], detect_group=True)
def insert(self):
+ # Empty both target tables, then insert every group from self.data.groups
+ # followed by the values listed under it. The deletes go through
+ # target_cursor.execute() like every other statement in this method.
+ self.target_cursor.execute("DELETE FROM ethnic_groups")
+ self.target_cursor.execute("DELETE FROM ethnic_group_aliases")
for group in self.data.groups:
- self.target_cursor.execute("INSERT INTO ethnic_groups (name) VALUES(%s)", (group,))
+ self.target_cursor.execute("INSERT INTO ethnic_groups (value) VALUES(%s)", (group,))
id = self.target_db.insert_id()
for alias in self.data.groups[group]:
- self.target_cursor.execute("INSERT INTO ethnic_group_aliases (ethnic_group_id, name) "+
+ self.target_cursor.execute("INSERT INTO ethnic_group_aliases (ethnic_group_id, value) "+
"VALUES(%s, %s)", (id, alias))
def process(self):
self.end()
class GeoEthnoAncestryBuilder(DataMigrator):
- """Update indirect location ancestry relations"""
+ """Build indirect location ancestry relations"""
implements(IDataMigrator)
implements(IDataMigrator)
map = [
- ('Ethnie_GrSocial', 'ethnic_group:name'),
+ ('Ethnie_GrSocial', 'ethnic_group'),
('Form_Genr_Style', 'vernacular_style'),
('FormStyl generi', 'generic_style')
]
--- /dev/null
+# This file by Fredrik Lundh from:
+# http://effbot.org/zone/unicode-convert.htm
+# http://effbot.python-hosting.com/file/stuff/sandbox/text/unaccent.py
+
+# use a dynamically populated translation dictionary to remove accents
+# from a string
+
+import unicodedata, sys
+
+# Keys are character codes; values are the replacement text used by
+# UnaccentedMap.mapchar() for characters without a unicode decomposition.
+CHAR_REPLACEMENT = {
+    # latin-1 characters that don't have a unicode decomposition
+    0xc6: u"AE", # LATIN CAPITAL LETTER AE
+    0xd0: u"D",  # LATIN CAPITAL LETTER ETH
+    0xd8: u"OE", # LATIN CAPITAL LETTER O WITH STROKE
+    0xde: u"Th", # LATIN CAPITAL LETTER THORN
+    0xdf: u"ss", # LATIN SMALL LETTER SHARP S
+    0xe6: u"ae", # LATIN SMALL LETTER AE
+    0xf0: u"d",  # LATIN SMALL LETTER ETH
+    0xf8: u"oe", # LATIN SMALL LETTER O WITH STROKE
+    0xfe: u"th", # LATIN SMALL LETTER THORN
+    }
+
+##
+# Translation dictionary. Translation entries are added to this
+# dictionary as needed.
+
+class UnaccentedMap(dict):
+    """Translation table that strips accents, populated on first lookup.
+
+    Each character code is resolved once by mapchar() and then stored in
+    the dictionary itself, so later lookups are plain dict hits.
+    """
+
+    ##
+    # Maps a unicode character code (the key) to a replacement code
+    # (either a character code or a unicode string).
+
+    def mapchar(self, key):
+        ch = self.get(key)
+        if ch is not None:
+            return ch
+        try:
+            char = unichr(key)
+        except NameError:
+            # Python 3 has no unichr(); chr() accepts any code point there.
+            char = chr(key)
+        de = unicodedata.decomposition(char)
+        if de:
+            try:
+                base = int(de.split(None, 1)[0], 16)
+            except (IndexError, ValueError):
+                # The first field is a tag such as "<compat>", not a code
+                # point: leave the character unchanged.
+                ch = key
+            else:
+                # The base character may itself decompose (U+1EA5 ->
+                # U+00E2 -> U+0061) or be listed in CHAR_REPLACEMENT
+                # (U+01FD -> U+00E6 -> "ae"), so resolve it the same way
+                # instead of stopping after one level.
+                if base != key:
+                    ch = self.mapchar(base)
+                else:
+                    ch = key
+        else:
+            ch = CHAR_REPLACEMENT.get(key, key)
+        self[key] = ch
+        return ch
+
+    # Compare the version tuple, not the version string: string comparison
+    # is lexicographic and would misorder multi-digit components.
+    if sys.version_info >= (2, 5):
+        # use __missing__ where available
+        __missing__ = mapchar
+    else:
+        # otherwise, use standard __getitem__ hook (this is slower,
+        # since it's called for each character)
+        __getitem__ = mapchar
+
+
+_map = UnaccentedMap()
+
+def unaccent(str):
+    """Return `str` translated through the module-level table `_map`."""
+    translated = str.translate(_map)
+    return translated
+
+def unaccent_icmp(str1, str2):
+    """Three-way comparison of two strings, ignoring accents and case.
+
+    Both arguments are passed through unaccent() and lower-cased first.
+    Returns 1 if the first is greater, -1 if it is smaller, 0 if equal.
+    """
+    left = unaccent(str1).lower()
+    right = unaccent(str2).lower()
+    # Each comparison is 0 or 1, so the difference is exactly 1, -1 or 0.
+    return (left > right) - (left < right)