]> git.parisson.com Git - telemeta-data.git/commitdiff
CREM database and migrator: use numeric ids for locations, and store full ancestralit...
authorolivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Mon, 1 Feb 2010 10:37:37 +0000 (10:37 +0000)
committerolivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Mon, 1 Feb 2010 10:37:37 +0000 (10:37 +0000)
git-svn-id: http://svn.parisson.org/svn/crem@144 3bf09e05-f825-4182-b9bc-eedd7160adf0

trunk/docref/crem.sql
trunk/import/migration/tasks/geoethno.py
trunk/import/migration/tasks/items.py

index f34da5f94d467fb979172f6f5818f79f2b7d3c42..ea9b40e3ffb6a0db79294868c6f3cf8671d1d81b 100644 (file)
@@ -120,39 +120,44 @@ CREATE TABLE publisher_collections (
 --
 
 CREATE TABLE location_types (
-    id              VARCHAR(64) NOT NULL PRIMARY KEY,
+    id              INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, 
+    code            VARCHAR(64) NOT NULL UNIQUE,
     name            VARCHAR(150) NOT NULL
 ) CHARACTER SET='utf8' ENGINE=InnoDB;    
 
 CREATE TABLE locations (
-    name                VARCHAR(150) NOT NULL PRIMARY KEY,
-    type                ENUM('country', 'continent', 'other') NOT NULL,
-    complete_type_id    VARCHAR(64) NOT NULL,
-    current_name        VARCHAR(150), -- recursif
+    id                  INTEGER NOT NULL AUTO_INCREMENT PRIMARY KEY, 
+    name                VARCHAR(150) NOT NULL UNIQUE,
+    type                INTEGER NOT NULL DEFAULT 0, -- 1=continent, 2=country
+    complete_type_id    INTEGER NOT NULL,
+    current_location_id INTEGER, -- recursif
     is_authoritative    BOOLEAN NOT NULL DEFAULT 0,
 
-    FOREIGN KEY(current_name) REFERENCES locations (name) ON DELETE SET NULL ON UPDATE CASCADE,
+    INDEX(type),
+    FOREIGN KEY(current_location_id) REFERENCES locations (id) ON DELETE SET NULL ON UPDATE CASCADE,
     FOREIGN KEY(complete_type_id) REFERENCES location_types (id) ON DELETE CASCADE
 ) CHARACTER SET='utf8' ENGINE=InnoDB;
 
 CREATE TABLE location_aliases (
-    location_name       VARCHAR(150) NOT NULL,
+    location_id         INTEGER NOT NULL,
     alias               VARCHAR(150) NOT NULL,
     is_authoritative    BOOLEAN NOT NULL DEFAULT 0,
 
-    PRIMARY KEY(location_name, alias),
-    FOREIGN KEY(location_name) REFERENCES locations (name
+    PRIMARY KEY(location_id, alias),
+    FOREIGN KEY(location_id) REFERENCES locations (id
         ON DELETE CASCADE ON UPDATE CASCADE
 ) CHARACTER SET='utf8' ENGINE=InnoDB;
 
 CREATE TABLE location_relations (
-    location_name           VARCHAR(150) NOT NULL,
-    parent_location_name    VARCHAR(150),
+    location_id             INTEGER NOT NULL,
+    ancestor_location_id    INTEGER NOT NULL,
+    is_direct               BOOLEAN NOT NULL DEFAULT FALSE,
 
-    PRIMARY KEY(location_name, parent_location_name),
-    FOREIGN KEY(location_name) REFERENCES locations (name)
+    PRIMARY KEY(location_id, ancestor_location_id),
+    INDEX(is_direct),
+    FOREIGN KEY(location_id) REFERENCES locations (id)
         ON DELETE CASCADE ON UPDATE CASCADE,
-    FOREIGN KEY(parent_location_name) REFERENCES locations (name)
+    FOREIGN KEY(ancestor_location_id) REFERENCES locations (id)
         ON DELETE CASCADE ON UPDATE CASCADE
 ) CHARACTER SET='utf8' ENGINE=InnoDB;
 
@@ -251,7 +256,7 @@ CREATE TABLE media_items (
     approx_duration         TIME NOT NULL DEFAULT 0,
     recorded_from_date      DATE NOT NULL DEFAULT 0,
     recorded_to_date        DATE NOT NULL DEFAULT 0,
-    location_name           VARCHAR(250),
+    location_id             INTEGER,
     location_comment        VARCHAR(250) NOT NULL DEFAULT '',
     ethnic_group_id         INTEGER,
     title                   VARCHAR(250) NOT NULL,
@@ -273,7 +278,7 @@ CREATE TABLE media_items (
 
     FOREIGN KEY(collection_id) REFERENCES media_collections (id)
         ON DELETE CASCADE,
-    FOREIGN KEY(location_name) REFERENCES locations (name)
+    FOREIGN KEY(location_id) REFERENCES locations (id)
         ON DELETE SET NULL,
     FOREIGN KEY(ethnic_group_id) REFERENCES ethnic_groups (id)
         ON DELETE SET NULL,
index 7caa6f58d63e610f3a0925b41e95ba1928b9ebf6..294445161d8c2b1d30b1821eb4b55747cea7e415 100644 (file)
@@ -44,6 +44,10 @@ class GeoEthnoImporter(DataMigrator):
 
     implements(IDataMigrator)
 
+    TYPE_OTHER = 0
+    TYPE_CONTINENT = 1
+    TYPE_COUNTRY   = 2
+
     def setup(self, cfg, src_db, target_db, logger):
         super(GeoEthnoImporter, self).setup(cfg, src_db, target_db, logger)
         self.cursor = self.target_cursor
@@ -82,54 +86,69 @@ class GeoEthnoImporter(DataMigrator):
                 raise e
 
         return inserted                
-        
+       
+    def getone(self, query, args=None):
+        self.cursor.execute(query, args)
+        return self.cursor.fetchone()[0]
 
     def insert_location(self, name, type, parentName, historic_names):
         if type == 'CONTINENT':
-            short_type = 'continent'
+            short_type = self.TYPE_CONTINENT
         elif type == 'BASEADM':
-            short_type = 'country'
+            short_type = self.TYPE_COUNTRY
         else:
-            short_type = 'other'
+            short_type = self.TYPE_OTHER
 
-        self.register_type(type)
+        type_id = self.register_type(type)
 
         self.stats['locations'] += self.replace("INSERT INTO locations "+
-                                                "(name, type, complete_type_id, current_name, is_authoritative) "+
-                                                "VALUES (%s, %s, %s, %s, %s)", (name, short_type, type, name, 1))
+                                                "(name, type, complete_type_id, is_authoritative) "+
+                                                "VALUES (%s, %s, %s, %s)", (name, short_type, type_id, 1))
+
+        id = self.getone("SELECT id FROM locations WHERE name = %s", (name,))
+
+        self.cursor.execute("UPDATE locations SET current_location_id = %s WHERE id = %s", (id, id));                       
 
-        if (len(parentName)):
+        if (parentName):
+            parent_id = self.getone("SELECT id FROM locations WHERE name = %s", (parentName,))
             self.stats['relations'] += self.replace("INSERT INTO location_relations "+
-                                                   "(location_name, parent_location_name) "+
-                                                   "VALUE (%s, %s)", (name, parentName))
+                                                   "(location_id, ancestor_location_id, is_direct) "+
+                                                   "VALUE (%s, %s, %s)", 
+                                                   (id, parent_id, 1))
 
         for hname in historic_names:
             self.stats['historical names'] += self.replace("INSERT INTO locations "+
-                                                           "(name, type, complete_type_id, current_name, is_authoritative) "+
-                                                           "VALUES (%s, %s, %s, %s, %s)", (hname, short_type, type, name, 1))
+                                                           "(name, type, complete_type_id, current_location_id, is_authoritative) "+
+                                                           "VALUES (%s, %s, %s, %s, %s)", (hname, short_type, type_id, id, 1))
+            hid = self.getone("SELECT id FROM locations WHERE name = %s", (hname,))
             if (len(parentName)):
                 self.stats['relations'] += self.replace("INSERT INTO location_relations "+
-                                                        "(location_name, parent_location_name) "+
-                                                        "VALUE (%s, %s)", (hname, parentName))
+                                                        "(location_id, ancestor_location_id, is_direct) "+
+                                                        "VALUE (%s, %s, %s)", 
+                                                        (hid, parent_id, 1))
                     
 
     def add_aliases(self, name, items):
         for alias in items:
+            id = self.getone("SELECT id FROM locations WHERE name = %s", (name,))
             self.stats['aliases'] += self.replace("INSERT INTO location_aliases "+
-                                                  "(location_name, alias, is_authoritative) "+
-                                                  "VALUES (%s, %s, %s)", (name, alias, 1))
+                                                  "(location_id, alias, is_authoritative) "+
+                                                  "VALUES (%s, %s, %s)", (id, alias, 1))
                 
 
 
-    def register_type(self, id):
+    def register_type(self, code):
         try:
-            self.known_types.index(id)
+            id = self.known_types.index(code) + 1
         except ValueError:
-            self.cursor.execute("INSERT INTO location_types (id, name) "+
-                                "VALUES (%s, %s)", (id,""))
-            self.known_types.append(id)                                
+            self.known_types.append(code)                      
+            id = self.known_types.index(code) + 1
+            self.cursor.execute("INSERT INTO location_types (id, code, name) "+
+                                "VALUES (%s, %s, %s)", (id, code, code))
             self.stats['types'] += 1
 
+        return id            
+
     def is_empty(self, node):
         for n in node.childNodes:
             if n.nodeType == dom.Node.TEXT_NODE:
@@ -194,6 +213,59 @@ class GeoEthnoImporter(DataMigrator):
         self.process_children(self.dom.getElementsByTagName('GEOETHNO')[0], '')
         self.end()
 
+class GeoEthnoAncestryBuilder(DataMigrator):
+    """Update indirect location ancestry relations"""
+
+    implements(IDataMigrator)
+
+    def get_name(self):
+        return "geoethno:ancestry"
+
+    def getone(self, query, args=None):
+        self.target_cursor.execute(query, args)
+        return self.target_cursor.fetchone()[0]
+
+    def get_ancestors(self, cursor, id):
+        cursor.execute("SELECT ancestor_location_id FROM location_relations "
+                       "WHERE location_id = %s AND is_direct = 1", (id,))
+        ancestors = []
+        while True:
+            row = cursor.fetchone()
+            if not row:
+                break
+            id, = row
+            ancestors.append(id)
+            up = self.get_ancestors(cursor, id)
+            if up:
+                ancestors.extend(up)
+        return ancestors                
+
+    def process(self):
+        self.target("DELETE FROM location_relations WHERE is_direct = 0")
+        ndirect = self.getone("SELECT count(*) FROM location_relations")
+        self.stats = {'direct' : ndirect, 'indirect': 0, 'total': ndirect}
+        self.start(ndirect)
+        rcursor1 = self.target_db.cursor()
+        rcursor2 = self.target_db.cursor()
+        rcursor1.execute("SELECT DISTINCT(location_id) FROM location_relations WHERE is_direct = 1")
+        while True:
+            row = rcursor1.fetchone()
+            if not row:
+                break
+
+            id, = row
+            ancestors = self.get_ancestors(rcursor2, id)
+            if len(ancestors) > 1:
+                for aid in ancestors[1:]:
+                    self.target("INSERT INTO location_relations (location_id, ancestor_location_id) "
+                                "VALUE (%s, %s)", (id, aid))
+                    self.stats['indirect'] += 1
+                    self.stats['total']    += 1                                
+
+            self.step()                                
+            
+        self.end()
+
 
 class Error(Exception):
     def __init__(self, importer, msg):
index ece7f613238d19ffec5ed241c9ac0021fe74f8fc..86199fc11cf275f009af705aff4f80c991ebf943 100644 (file)
@@ -34,6 +34,7 @@
 from telemeta.core import *
 from api import IDataMigrator
 from core import DataMigrator, EnumMapper
+from geoethno import GeoEthnoImporter
 import re
 import sys
 
@@ -195,30 +196,19 @@ class ItemsLocationsMapper(DataMigrator):
     def get_name(self):
         return "items:locations"
 
-    def is_descendant_of(self, location, ascendant, ascendant_type = None):
-        self.target("SELECT l.name, l.type FROM location_relations AS r "
-                    "INNER JOIN locations AS l ON r.parent_location_name = l.name "
-                    "WHERE r.location_name = %s",
-                     (location,))
-        while True:
-            row = self.target_cursor.fetchone()
-            if not row:
-                break
-            parent_name, parent_type = row
-            if parent_name == ascendant and ((not ascendant_type) or parent_type == ascendant_type):
-                return True
-            else:
-                if self.is_descendant_of(parent_name, ascendant, ascendant_type):
-                    return True
-        return False
+    def is_descendant_of(self, location, ascendant):
+        self.target("SELECT COUNT(*) FROM location_relations "
+                    "WHERE location_id = %s AND ancestor_location_id = %s",
+                    (location, ascendant))
+        return bool(self.target_cursor.fetchone()[0])                    
 
     def find_location(self, name_or_alias, type = None):
-        select = "SELECT name, type FROM locations AS l INNER JOIN location_aliases AS a ON l.name = a.location_name"
+        select = "SELECT id, type FROM locations AS l INNER JOIN location_aliases AS a ON l.id = a.location_id "
         if type:
-            self.target(select + " WHERE l.type = %s AND (l.name LIKE %s OR a.alias LIKE %s)",
+            self.target(select + "WHERE l.type = %s AND (l.name LIKE %s OR a.alias LIKE %s)",
                         (type, name_or_alias, name_or_alias))
         else:
-            self.target(select + " WHERE l.name LIKE %s OR a.alias LIKE %s",
+            self.target(select + "WHERE l.name LIKE %s OR a.alias LIKE %s",
                         (name_or_alias, name_or_alias))
             
         if self.target_cursor.rowcount:
@@ -267,7 +257,7 @@ class ItemsLocationsMapper(DataMigrator):
         return (found, found_type)
 
     def process(self):
-        self.target("UPDATE media_items SET location_name = NULL, location_comment = ''")
+        self.target("UPDATE media_items SET location_id = NULL, location_comment = ''")
 
         self.src("SELECT Cote_Phono, Continent, Etat, Region_Village FROM Phono")
         self.stats = {
@@ -296,12 +286,12 @@ class ItemsLocationsMapper(DataMigrator):
                 unmatched = []
                 location, type = self.find_location_by_sequence(sequence, matched, unmatched)
                 if location:
-                    if type == 'continent':
+                    if type == GeoEthnoImporter.TYPE_CONTINENT:
                         self.stats['continent'] += 1
                         self.warn("Unable to find GeoEthno country in sequence: %s" % str(sequence))
-                    elif type == 'country':
+                    elif type == GeoEthnoImporter.TYPE_COUNTRY:
                         self.stats['country'] += 1
-                    elif type == 'other':
+                    elif type == GeoEthnoImporter.TYPE_OTHER:
                         self.stats['fullmap'] += 1
                         #sys.stdout.write("fullmap: %s\n" % str(matched))                
                     else:
@@ -311,7 +301,7 @@ class ItemsLocationsMapper(DataMigrator):
                     #sys.stdout.write("nomap: %s\n" % str(unmatched))                
 
                 comment = ", ".join(unmatched)
-                self.target("UPDATE media_items SET location_name = %s, location_comment = %s WHERE old_code = %s",
+                self.target("UPDATE media_items SET location_id = %s, location_comment = %s WHERE old_code = %s",
                             (location, comment, oldcode))
             else:
                 self.stats['empty'] += 1