From: olivier Date: Mon, 1 Feb 2010 10:37:37 +0000 (+0000) Subject: CREM database and migrator: use numeric ids for locations, and store full ancestralit... X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=88cd26717adf5c7bdbd93ae9f61be76d48050a72;p=telemeta-data.git CREM database and migrator: use numeric ids for locations, and store full ancestrality in location_relations for fast 'nested' searches git-svn-id: http://svn.parisson.org/svn/crem@144 3bf09e05-f825-4182-b9bc-eedd7160adf0 --- diff --git a/trunk/docref/crem.sql b/trunk/docref/crem.sql index f34da5f..ea9b40e 100644 --- a/trunk/docref/crem.sql +++ b/trunk/docref/crem.sql @@ -120,39 +120,44 @@ CREATE TABLE publisher_collections ( -- CREATE TABLE location_types ( - id VARCHAR(64) NOT NULL PRIMARY KEY, + id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, + code VARCHAR(64) NOT NULL UNIQUE, name VARCHAR(150) NOT NULL ) CHARACTER SET='utf8' ENGINE=InnoDB; CREATE TABLE locations ( - name VARCHAR(150) NOT NULL PRIMARY KEY, - type ENUM('country', 'continent', 'other') NOT NULL, - complete_type_id VARCHAR(64) NOT NULL, - current_name VARCHAR(150), -- recursif + id INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(150) NOT NULL UNIQUE, + type INTEGER NOT NULL DEFAULT 0, -- 1=continent, 2=country + complete_type_id INTEGER NOT NULL, + current_location_id INTEGER, -- recursif is_authoritative BOOLEAN NOT NULL DEFAULT 0, - FOREIGN KEY(current_name) REFERENCES locations (name) ON DELETE SET NULL ON UPDATE CASCADE, + INDEX(type), + FOREIGN KEY(current_location_id) REFERENCES locations (id) ON DELETE SET NULL ON UPDATE CASCADE, FOREIGN KEY(complete_type_id) REFERENCES location_types (id) ON DELETE CASCADE ) CHARACTER SET='utf8' ENGINE=InnoDB; CREATE TABLE location_aliases ( - location_name VARCHAR(150) NOT NULL, + location_id INTEGER NOT NULL, alias VARCHAR(150) NOT NULL, is_authoritative BOOLEAN NOT NULL DEFAULT 0, - PRIMARY KEY(location_name, alias), - FOREIGN KEY(location_name) REFERENCES locations (name) + PRIMARY KEY(location_id, alias), + FOREIGN KEY(location_id) REFERENCES locations (id) ON DELETE CASCADE ON UPDATE CASCADE ) CHARACTER SET='utf8' ENGINE=InnoDB; CREATE TABLE location_relations ( - location_name VARCHAR(150) NOT NULL, - parent_location_name VARCHAR(150), + location_id INTEGER NOT NULL, + ancestor_location_id INTEGER NOT NULL, + is_direct BOOLEAN NOT NULL DEFAULT FALSE, - PRIMARY KEY(location_name, parent_location_name), - FOREIGN KEY(location_name) REFERENCES locations (name) + PRIMARY KEY(location_id, ancestor_location_id), + INDEX(is_direct), + FOREIGN KEY(location_id) REFERENCES locations (id) ON DELETE CASCADE ON UPDATE CASCADE, - FOREIGN KEY(parent_location_name) REFERENCES locations (name) + FOREIGN KEY(ancestor_location_id) REFERENCES locations (id) ON DELETE CASCADE ON UPDATE CASCADE ) CHARACTER SET='utf8' ENGINE=InnoDB; @@ -251,7 +256,7 @@ CREATE TABLE media_items ( approx_duration TIME NOT NULL DEFAULT 0, recorded_from_date DATE NOT NULL DEFAULT 0, recorded_to_date DATE NOT NULL DEFAULT 0, - location_name VARCHAR(250), + location_id INTEGER, location_comment VARCHAR(250) NOT NULL DEFAULT '', ethnic_group_id INTEGER, title VARCHAR(250) NOT NULL, @@ -273,7 +278,7 @@ CREATE TABLE media_items ( FOREIGN KEY(collection_id) REFERENCES media_collections (id) ON DELETE CASCADE, - FOREIGN KEY(location_name) REFERENCES locations (name) + FOREIGN KEY(location_id) REFERENCES locations (id) ON DELETE SET NULL, FOREIGN KEY(ethnic_group_id) REFERENCES ethnic_groups (id) ON DELETE SET NULL, diff --git a/trunk/import/migration/tasks/geoethno.py b/trunk/import/migration/tasks/geoethno.py index 7caa6f5..2944451 100644 --- a/trunk/import/migration/tasks/geoethno.py +++ b/trunk/import/migration/tasks/geoethno.py @@ -44,6 +44,10 @@ class GeoEthnoImporter(DataMigrator): implements(IDataMigrator) + TYPE_OTHER = 0 + TYPE_CONTINENT = 1 + TYPE_COUNTRY = 2 + def setup(self, cfg, src_db, target_db, logger): super(GeoEthnoImporter, self).setup(cfg, src_db, target_db, logger) self.cursor = self.target_cursor @@ -82,54 +86,69 @@ class GeoEthnoImporter(DataMigrator): raise e return inserted - + + def getone(self, query, args=None): + self.cursor.execute(query, args) + return self.cursor.fetchone()[0] def insert_location(self, name, type, parentName, historic_names): if type == 'CONTINENT': - short_type = 'continent' + short_type = self.TYPE_CONTINENT elif type == 'BASEADM': - short_type = 'country' + short_type = self.TYPE_COUNTRY else: - short_type = 'other' + short_type = self.TYPE_OTHER - self.register_type(type) + type_id = self.register_type(type) self.stats['locations'] += self.replace("INSERT INTO locations "+ - "(name, type, complete_type_id, current_name, is_authoritative) "+ - "VALUES (%s, %s, %s, %s, %s)", (name, short_type, type, name, 1)) + "(name, type, complete_type_id, is_authoritative) "+ + "VALUES (%s, %s, %s, %s)", (name, short_type, type_id, 1)) + + id = self.getone("SELECT id FROM locations WHERE name = %s", (name,)) + + self.cursor.execute("UPDATE locations SET current_location_id = %s WHERE id = %s", (id, id)); - if (len(parentName)): + if (parentName): + parent_id = self.getone("SELECT id FROM locations WHERE name = %s", (parentName,)) self.stats['relations'] += self.replace("INSERT INTO location_relations "+ - "(location_name, parent_location_name) "+ - "VALUE (%s, %s)", (name, parentName)) + "(location_id, ancestor_location_id, is_direct) "+ + "VALUE (%s, %s, %s)", + (id, parent_id, 1)) for hname in historic_names: self.stats['historical names'] += self.replace("INSERT INTO locations "+ - "(name, type, complete_type_id, current_name, is_authoritative) "+ - "VALUES (%s, %s, %s, %s, %s)", (hname, short_type, type, name, 1)) + "(name, type, complete_type_id, current_location_id, is_authoritative) "+ + "VALUES (%s, %s, %s, %s, %s)", (hname, short_type, type_id, id, 1)) + hid = self.getone("SELECT id FROM locations WHERE name = %s", (hname,)) if (len(parentName)): self.stats['relations'] += self.replace("INSERT INTO location_relations "+ - "(location_name, parent_location_name) "+ - "VALUE (%s, %s)", (hname, parentName)) + "(location_id, ancestor_location_id, is_direct) "+ + "VALUE (%s, %s, %s)", + (hid, parent_id, 1)) def add_aliases(self, name, items): for alias in items: + id = self.getone("SELECT id FROM locations WHERE name = %s", (name,)) self.stats['aliases'] += self.replace("INSERT INTO location_aliases "+ - "(location_name, alias, is_authoritative) "+ - "VALUES (%s, %s, %s)", (name, alias, 1)) + "(location_id, alias, is_authoritative) "+ + "VALUES (%s, %s, %s)", (id, alias, 1)) - def register_type(self, id): + def register_type(self, code): try: - self.known_types.index(id) + id = self.known_types.index(code) + 1 except ValueError: - self.cursor.execute("INSERT INTO location_types (id, name) "+ - "VALUES (%s, %s)", (id,"")) - self.known_types.append(id) + self.known_types.append(code) + id = self.known_types.index(code) + 1 + self.cursor.execute("INSERT INTO location_types (id, code, name) "+ + "VALUES (%s, %s, %s)", (id, code, code)) self.stats['types'] += 1 + return id + def is_empty(self, node): for n in node.childNodes: if n.nodeType == dom.Node.TEXT_NODE: @@ -194,6 +213,59 @@ class GeoEthnoImporter(DataMigrator): self.process_children(self.dom.getElementsByTagName('GEOETHNO')[0], '') self.end() +class GeoEthnoAncestryBuilder(DataMigrator): + """Update indirect location ancestry relations""" + + implements(IDataMigrator) + + def get_name(self): + return "geoethno:ancestry" + + def getone(self, query, args=None): + self.target_cursor.execute(query, args) + return self.target_cursor.fetchone()[0] + + def get_ancestors(self, cursor, id): + cursor.execute("SELECT ancestor_location_id FROM location_relations " + "WHERE location_id = %s AND is_direct = 1", (id,)) + ancestors = [] + while True: + row = cursor.fetchone() + if not row: + break + id, = row + ancestors.append(id) + up = self.get_ancestors(cursor, id) + if up: + ancestors.extend(up) + return ancestors + + def process(self): + self.target("DELETE FROM location_relations WHERE is_direct = 0") + ndirect = self.getone("SELECT count(*) FROM location_relations") + self.stats = {'direct' : ndirect, 'indirect': 0, 'total': ndirect} + self.start(ndirect) + rcursor1 = self.target_db.cursor() + rcursor2 = self.target_db.cursor() + rcursor1.execute("SELECT DISTINCT(location_id) FROM location_relations WHERE is_direct = 1") + while True: + row = rcursor1.fetchone() + if not row: + break + + id, = row + ancestors = self.get_ancestors(rcursor2, id) + if len(ancestors) > 1: + for aid in ancestors[1:]: + self.target("INSERT INTO location_relations (location_id, ancestor_location_id) " + "VALUE (%s, %s)", (id, aid)) + self.stats['indirect'] += 1 + self.stats['total'] += 1 + + self.step() + + self.end() + class Error(Exception): def __init__(self, importer, msg): diff --git a/trunk/import/migration/tasks/items.py b/trunk/import/migration/tasks/items.py index ece7f61..86199fc 100644 --- a/trunk/import/migration/tasks/items.py +++ b/trunk/import/migration/tasks/items.py @@ -34,6 +34,7 @@ from telemeta.core import * from api import IDataMigrator from core import DataMigrator, EnumMapper +from geoethno import GeoEthnoImporter import re import sys @@ -195,30 +196,19 @@ class ItemsLocationsMapper(DataMigrator): def get_name(self): return "items:locations" - def is_descendant_of(self, location, ascendant, ascendant_type = None): - self.target("SELECT l.name, l.type FROM location_relations AS r " - "INNER JOIN locations AS l ON r.parent_location_name = l.name " - "WHERE r.location_name = %s", - (location,)) - while True: - row = self.target_cursor.fetchone() - if not row: - break - parent_name, parent_type = row - if parent_name == ascendant and ((not ascendant_type) or parent_type == ascendant_type): - return True - else: - if self.is_descendant_of(parent_name, ascendant, ascendant_type): - return True - return False + def is_descendant_of(self, location, ascendant): + self.target("SELECT COUNT(*) FROM location_relations " + "WHERE location_id = %s AND ancestor_location_id = %s", + (location, ascendant)) + return bool(self.target_cursor.fetchone()[0]) def find_location(self, name_or_alias, type = None): - select = "SELECT name, type FROM locations AS l INNER JOIN location_aliases AS a ON l.name = a.location_name" + select = "SELECT id, type FROM locations AS l INNER JOIN location_aliases AS a ON l.id = a.location_id " if type: - self.target(select + " WHERE l.type = %s AND (l.name LIKE %s OR a.alias LIKE %s)", + self.target(select + "WHERE l.type = %s AND (l.name LIKE %s OR a.alias LIKE %s)", (type, name_or_alias, name_or_alias)) else: - self.target(select + " WHERE l.name LIKE %s OR a.alias LIKE %s", + self.target(select + "WHERE l.name LIKE %s OR a.alias LIKE %s", (name_or_alias, name_or_alias)) if self.target_cursor.rowcount: @@ -267,7 +257,7 @@ class ItemsLocationsMapper(DataMigrator): return (found, found_type) def process(self): - self.target("UPDATE media_items SET location_name = NULL, location_comment = ''") + self.target("UPDATE media_items SET location_id = NULL, location_comment = ''") self.src("SELECT Cote_Phono, Continent, Etat, Region_Village FROM Phono") self.stats = { @@ -296,12 +286,12 @@ class ItemsLocationsMapper(DataMigrator): unmatched = [] location, type = self.find_location_by_sequence(sequence, matched, unmatched) if location: - if type == 'continent': + if type == GeoEthnoImporter.TYPE_CONTINENT: self.stats['continent'] += 1 self.warn("Unable to find GeoEthno country in sequence: %s" % str(sequence)) - elif type == 'country': + elif type == GeoEthnoImporter.TYPE_COUNTRY: self.stats['country'] += 1 - elif type == 'other': + elif type == GeoEthnoImporter.TYPE_OTHER: self.stats['fullmap'] += 1 #sys.stdout.write("fullmap: %s\n" % str(matched)) else: @@ -311,7 +301,7 @@ class ItemsLocationsMapper(DataMigrator): #sys.stdout.write("nomap: %s\n" % str(unmatched)) comment = ", ".join(unmatched) - self.target("UPDATE media_items SET location_name = %s, location_comment = %s WHERE old_code = %s", + self.target("UPDATE media_items SET location_id = %s, location_comment = %s WHERE old_code = %s", (location, comment, oldcode)) else: self.stats['empty'] += 1