git.parisson.com Git - telemeta-data.git/commitdiff
migration: improve item geo mapper (about x4 more efficient)
author olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Thu, 9 Jul 2009 11:21:15 +0000 (11:21 +0000)
committer olivier <olivier@3bf09e05-f825-4182-b9bc-eedd7160adf0>
Thu, 9 Jul 2009 11:21:15 +0000 (11:21 +0000)
git-svn-id: http://svn.parisson.org/svn/crem@123 3bf09e05-f825-4182-b9bc-eedd7160adf0

trunk/import/migration/tasks/items.py

index 032c49c7154d802a549dd1b396a130bd2f1d111f..179e35e15fd4c6df38815c26bbed3101780d1a0e 100644 (file)
@@ -34,6 +34,8 @@
 from telemeta.core import *
 from api import IDataMigrator
 from core import DataMigrator, EnumMapper
+import re
+import sys
 
 class ItemsCopyMigrator(DataMigrator):
     """Perform a preliminary raw copy of the item table"""
@@ -193,7 +195,7 @@ class ItemsLocationsMapper(DataMigrator):
     def get_name(self):
         return "items:locations"
 
-    def is_descendant_of(self, location, ascendant, ascendant_type):
+    def is_descendant_of(self, location, ascendant, ascendant_type = None):
         self.target("SELECT l.name, l.type FROM location_relations AS r "
                     "INNER JOIN locations AS l ON r.parent_location_name = l.name "
                     "WHERE r.location_name = %s",
@@ -203,20 +205,25 @@ class ItemsLocationsMapper(DataMigrator):
             if not row:
                 break
             parent_name, parent_type = row
-            if parent_name == ascendant and parent_type == ascendant_type:
+            if parent_name == ascendant and ((not ascendant_type) or parent_type == ascendant_type):
                 return True
             else:
                 if self.is_descendant_of(parent_name, ascendant, ascendant_type):
                     return True
         return False
 
-    def find_location(self, name_or_alias, type):
-        self.target("SELECT name FROM locations AS l INNER JOIN location_aliases AS a ON l.name = a.location_name "
-                    "WHERE l.type = %s AND (l.name LIKE %s OR a.alias LIKE %s)",
-                    (type, name_or_alias, name_or_alias))
+    def find_location(self, name_or_alias, type = None):
+        select = "SELECT name, type FROM locations AS l INNER JOIN location_aliases AS a ON l.name = a.location_name"
+        if type:
+            self.target(select + " WHERE l.type = %s AND (l.name LIKE %s OR a.alias LIKE %s)",
+                        (type, name_or_alias, name_or_alias))
+        else:
+            self.target(select + " WHERE l.name LIKE %s OR a.alias LIKE %s",
+                        (name_or_alias, name_or_alias))
+            
         if self.target_cursor.rowcount:
-            return self.target_cursor.fetchone()[0]
-        return None
+            return self.target_cursor.fetchone()
+        return (None, None)
         
     def concat(self, locality, country, continent = None):
         pieces = []
@@ -229,6 +236,33 @@ class ItemsLocationsMapper(DataMigrator):
 
         return u'-'.join(pieces)
 
+    def parse_location_str(self, str):
+        str = re.sub("\*", "", str.strip())
+        str = re.sub(" +", " ", str)
+        return re.split(" *[,;/] *", str)
+
+    def find_location_by_sequence(self, sequence, matched = None, unmatched = None):
+        found       = None
+        found_type  = None
+        broken      = False
+        for l in sequence:
+            if not broken:
+                location, type = self.find_location(l)
+                if not location:
+                    broken = True
+                if not found or self.is_descendant_of(location, found):
+                    found       = location
+                    found_type  = type
+                    if not matched is None:
+                        matched.append(found)
+                else:
+                    broken = True
+            if broken:
+                if not unmatched is None:
+                    unmatched.append(l)
+
+        return (found, found_type)
+
     def process(self):
         self.target("UPDATE media_items SET location_name = NULL, location_comment = ''")
 
@@ -249,36 +283,33 @@ class ItemsLocationsMapper(DataMigrator):
             if not row:
                 break
             oldcode, continent, country, locality = row
-            continent   = continent.strip()
-            country     = country.strip()
-            locality    = locality.strip()
-
-            comment     = ''
-            location = self.find_location(continent, "continent")
-            if location:
-                c = self.find_location(country, "country")
-                if c and self.is_descendant_of(c, location, 'continent'):
-                    location = c
-                    l = self.find_location(locality, "other")
-                    if l and self.is_descendant_of(l, location, 'country'):
-                        location = l
+            sequence = []
+            sequence.extend(self.parse_location_str(continent))
+            sequence.extend(self.parse_location_str(country))
+            sequence.extend(self.parse_location_str(locality))
+
+            if sequence:
+                matched = []
+                unmatched = []
+                location, type = self.find_location_by_sequence(sequence, matched, unmatched)
+                if location:
+                    if type == 'continent':
+                        self.stats['continent'] += 1
+                    elif type == 'country':
+                        self.stats['country'] += 1
+                    elif type == 'other':
                         self.stats['fullmap'] += 1
+                        #sys.stdout.write("fullmap: %s\n" % str(matched))                
                     else:
-                        comment = locality
-                        self.stats['country'] += 1
+                        raise "Undetermined location type: %s" % type
                 else:
-                    comment = self.concat(locality, country)
-                    self.stats['continent'] += 1
-            else:
-                comment = self.concat(locality, country, continent)
-        
-            if location or comment:
+                    self.stats['nomap'] += 1
+
+                comment = ", ".join(unmatched)
                 self.target("UPDATE media_items SET location_name = %s, location_comment = %s WHERE old_code = %s",
                             (location, comment, oldcode))
-            elif not comment:
-                self.stats['empty'] += 1
             else:
-                self.stats['nomap'] += 1
+                self.stats['empty'] += 1
 
             self.step()