From: olivier Date: Thu, 9 Jul 2009 11:21:15 +0000 (+0000) Subject: migration: improve item geo mapper (about x4 more efficient) X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=c86e95f8aa23c388dcdb9cf5b019e91923cb3be4;p=telemeta-data.git migration: improve item geo mapper (about x4 more efficient) git-svn-id: http://svn.parisson.org/svn/crem@123 3bf09e05-f825-4182-b9bc-eedd7160adf0 --- diff --git a/trunk/import/migration/tasks/items.py b/trunk/import/migration/tasks/items.py index 032c49c..179e35e 100644 --- a/trunk/import/migration/tasks/items.py +++ b/trunk/import/migration/tasks/items.py @@ -34,6 +34,8 @@ from telemeta.core import * from api import IDataMigrator from core import DataMigrator, EnumMapper +import re +import sys class ItemsCopyMigrator(DataMigrator): """Perform a preliminary raw copy of the item table""" @@ -193,7 +195,7 @@ class ItemsLocationsMapper(DataMigrator): def get_name(self): return "items:locations" - def is_descendant_of(self, location, ascendant, ascendant_type): + def is_descendant_of(self, location, ascendant, ascendant_type = None): self.target("SELECT l.name, l.type FROM location_relations AS r " "INNER JOIN locations AS l ON r.parent_location_name = l.name " "WHERE r.location_name = %s", @@ -203,20 +205,25 @@ class ItemsLocationsMapper(DataMigrator): if not row: break parent_name, parent_type = row - if parent_name == ascendant and parent_type == ascendant_type: + if parent_name == ascendant and ((not ascendant_type) or parent_type == ascendant_type): return True else: if self.is_descendant_of(parent_name, ascendant, ascendant_type): return True return False - def find_location(self, name_or_alias, type): - self.target("SELECT name FROM locations AS l INNER JOIN location_aliases AS a ON l.name = a.location_name " - "WHERE l.type = %s AND (l.name LIKE %s OR a.alias LIKE %s)", - (type, name_or_alias, name_or_alias)) + def find_location(self, name_or_alias, type = None): + select = "SELECT name, type FROM locations AS l INNER JOIN location_aliases AS a ON l.name = a.location_name" + if type: + self.target(select + " WHERE l.type = %s AND (l.name LIKE %s OR a.alias LIKE %s)", + (type, name_or_alias, name_or_alias)) + else: + self.target(select + " WHERE l.name LIKE %s OR a.alias LIKE %s", + (name_or_alias, name_or_alias)) + if self.target_cursor.rowcount: - return self.target_cursor.fetchone()[0] - return None + return self.target_cursor.fetchone() + return (None, None) def concat(self, locality, country, continent = None): pieces = [] @@ -229,6 +236,33 @@ class ItemsLocationsMapper(DataMigrator): return u'-'.join(pieces) + def parse_location_str(self, str): + str = re.sub("\*", "", str.strip()) + str = re.sub(" +", " ", str) + return re.split(" *[,;/] *", str) + + def find_location_by_sequence(self, sequence, matched = None, unmatched = None): + found = None + found_type = None + broken = False + for l in sequence: + if not broken: + location, type = self.find_location(l) + if not location: + broken = True + if not found or self.is_descendant_of(location, found): + found = location + found_type = type + if not matched is None: + matched.append(found) + else: + broken = True + if broken: + if not unmatched is None: + unmatched.append(l) + + return (found, found_type) + def process(self): self.target("UPDATE media_items SET location_name = NULL, location_comment = ''") @@ -249,36 +283,33 @@ class ItemsLocationsMapper(DataMigrator): if not row: break oldcode, continent, country, locality = row - continent = continent.strip() - country = country.strip() - locality = locality.strip() - - comment = '' - location = self.find_location(continent, "continent") - if location: - c = self.find_location(country, "country") - if c and self.is_descendant_of(c, location, 'continent'): - location = c - l = self.find_location(locality, "other") - if l and self.is_descendant_of(l, location, 'country'): - location = l + sequence = [] + sequence.extend(self.parse_location_str(continent)) + sequence.extend(self.parse_location_str(country)) + sequence.extend(self.parse_location_str(locality)) + + if sequence: + matched = [] + unmatched = [] + location, type = self.find_location_by_sequence(sequence, matched, unmatched) + if location: + if type == 'continent': + self.stats['continent'] += 1 + elif type == 'country': + self.stats['country'] += 1 + elif type == 'other': self.stats['fullmap'] += 1 + #sys.stdout.write("fullmap: %s\n" % str(matched)) else: - comment = locality - self.stats['country'] += 1 + raise "Undetermined location type: %s" % type else: - comment = self.concat(locality, country) - self.stats['continent'] += 1 - else: - comment = self.concat(locality, country, continent) - - if location or comment: + self.stats['nomap'] += 1 + + comment = ", ".join(unmatched) self.target("UPDATE media_items SET location_name = %s, location_comment = %s WHERE old_code = %s", (location, comment, oldcode)) - elif not comment: - self.stats['empty'] += 1 else: - self.stats['nomap'] += 1 + self.stats['empty'] += 1 self.step()