From: olivier Date: Mon, 15 Feb 2010 23:01:33 +0000 (+0000) Subject: turn ethnic_groups into a simple enumeration, ensure ethnic group name uniqueness X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=90d096eeb75ef1695b990a10833a0c72a2aa953f;p=telemeta-data.git turn ethnic_groups into a simple enumeration, ensure ethnic group name uniqueness git-svn-id: http://svn.parisson.org/svn/crem@158 3bf09e05-f825-4182-b9bc-eedd7160adf0 --- diff --git a/trunk/docref/crem.sql b/trunk/docref/crem.sql index 4c4fcb1..c07478d 100644 --- a/trunk/docref/crem.sql +++ b/trunk/docref/crem.sql @@ -170,13 +170,14 @@ CREATE TABLE location_relations ( CREATE TABLE ethnic_groups ( id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(250) NOT NULL + value VARCHAR(250) NOT NULL UNIQUE ) CHARACTER SET='utf8' ENGINE=InnoDB; CREATE TABLE ethnic_group_aliases ( ethnic_group_id INTEGER NOT NULL, - name VARCHAR(250) NOT NULL, + value VARCHAR(250) NOT NULL, + UNIQUE(ethnic_group_id, value), FOREIGN KEY(ethnic_group_id) REFERENCES ethnic_groups (id) ON DELETE CASCADE ) CHARACTER SET='utf8' ENGINE=InnoDB; diff --git a/trunk/docref/docref.odt b/trunk/docref/docref.odt index 1373847..7620789 100644 Binary files a/trunk/docref/docref.odt and b/trunk/docref/docref.odt differ diff --git a/trunk/import/migration/tasks/core.py b/trunk/import/migration/tasks/core.py index c9acafa..fcf8693 100644 --- a/trunk/import/migration/tasks/core.py +++ b/trunk/import/migration/tasks/core.py @@ -39,6 +39,8 @@ import time import warnings import _mysql_exceptions import subprocess +from unaccent import unaccent_icmp + class DataMigrationTask(Component): @@ -122,14 +124,18 @@ class GroupedItemsManager(object): def __init__(self): self.groups = {} - + + def find_group(self, group): + for g in self.groups: + if not unaccent_icmp(g, group): + return g + return None + def append_group(self, group): group = group.strip() if not len(group): return - try: - self.groups[group] - except KeyError: + if not self.find_group(group): self.groups[group] = [] def append_item(self, group, item, detect_group=False): @@ -141,19 +147,15 @@ class GroupedItemsManager(object): g = None if detect_group: - try: - self.groups[group] - g = group + g = self.find_group(group) + if g: i = item - except KeyError: - try: - self.groups[item] - g = item + else: + g = self.find_group(item) + if g: i = group - except KeyError: - pass else: - g = group + g = self.find_group(group) i = item if g: diff --git a/trunk/import/migration/tasks/ethnic.py b/trunk/import/migration/tasks/ethnic.py index 01f3d40..f0db630 100644 --- a/trunk/import/migration/tasks/ethnic.py +++ b/trunk/import/migration/tasks/ethnic.py @@ -72,11 +72,13 @@ class EthnicGroupsMigrator(DataMigrator): self.data.append_item(row[0], row[1], detect_group=True) def insert(self): + self.target("DELETE FROM ethnic_groups") + self.target("DELETE FROM ethnic_group_aliases") for group in self.data.groups: - self.target_cursor.execute("INSERT INTO ethnic_groups (name) VALUES(%s)", (group,)) + self.target_cursor.execute("INSERT INTO ethnic_groups (value) VALUES(%s)", (group,)) id = self.target_db.insert_id() for alias in self.data.groups[group]: - self.target_cursor.execute("INSERT INTO ethnic_group_aliases (ethnic_group_id, name) "+ + self.target_cursor.execute("INSERT INTO ethnic_group_aliases (ethnic_group_id, value) "+ "VALUES(%s, %s)", (id, alias)) def process(self): diff --git a/trunk/import/migration/tasks/geoethno.py b/trunk/import/migration/tasks/geoethno.py index 5abc8ca..5b26a61 100644 --- a/trunk/import/migration/tasks/geoethno.py +++ b/trunk/import/migration/tasks/geoethno.py @@ -216,7 +216,7 @@ class GeoEthnoImporter(DataMigrator): self.end() class GeoEthnoAncestryBuilder(DataMigrator): - """Update indirect location ancestry relations""" + """Build indirect location ancestry relations""" implements(IDataMigrator) diff --git a/trunk/import/migration/tasks/items.py b/trunk/import/migration/tasks/items.py index dc6a751..1b20870 100644 --- a/trunk/import/migration/tasks/items.py +++ b/trunk/import/migration/tasks/items.py @@ -111,7 +111,7 @@ class ItemsEnumMapper(EnumMapper): implements(IDataMigrator) map = [ - ('Ethnie_GrSocial', 'ethnic_group:name'), + ('Ethnie_GrSocial', 'ethnic_group'), ('Form_Genr_Style', 'vernacular_style'), ('FormStyl generi', 'generic_style') ] diff --git a/trunk/import/migration/tasks/unaccent.py b/trunk/import/migration/tasks/unaccent.py new file mode 100644 index 0000000..7c5757d --- /dev/null +++ b/trunk/import/migration/tasks/unaccent.py @@ -0,0 +1,71 @@ +# This file by Fredrik Lundh from: +# http://effbot.org/zone/unicode-convert.htm +# http://effbot.python-hosting.com/file/stuff/sandbox/text/unaccent.py + +# use a dynamically populated translation dictionary to remove accents +# from a string + +import unicodedata, sys + +CHAR_REPLACEMENT = { + # latin-1 characters that don't have a unicode decomposition + 0xc6: u"AE", # LATIN CAPITAL LETTER AE + 0xd0: u"D", # LATIN CAPITAL LETTER ETH + 0xd8: u"OE", # LATIN CAPITAL LETTER O WITH STROKE + 0xde: u"Th", # LATIN CAPITAL LETTER THORN + 0xdf: u"ss", # LATIN SMALL LETTER SHARP S + 0xe6: u"ae", # LATIN SMALL LETTER AE + 0xf0: u"d", # LATIN SMALL LETTER ETH + 0xf8: u"oe", # LATIN SMALL LETTER O WITH STROKE + 0xfe: u"th", # LATIN SMALL LETTER THORN + } + +## +# Translation dictionary. Translation entries are added to this +# dictionary as needed. + +class UnaccentedMap(dict): + + ## + # Maps a unicode character code (the key) to a replacement code + # (either a character code or a unicode string). + + def mapchar(self, key): + ch = self.get(key) + if ch is not None: + return ch + de = unicodedata.decomposition(unichr(key)) + if de: + try: + ch = int(de.split(None, 1)[0], 16) + except (IndexError, ValueError): + ch = key + else: + ch = CHAR_REPLACEMENT.get(key, key) + self[key] = ch + return ch + + if sys.version >= "2.5": + # use __missing__ where available + __missing__ = mapchar + else: + # otherwise, use standard __getitem__ hook (this is slower, + # since it's called for each character) + __getitem__ = mapchar + + +_map = UnaccentedMap() + +def unaccent(str): + return str.translate(_map) + +def unaccent_icmp(str1, str2): + str1 = unaccent(str1).lower() + str2 = unaccent(str2).lower() + if str1 > str2: + return 1 + + if str1 < str2: + return -1 + + return 0