From: Thomas Fillon Date: Sat, 25 Feb 2017 14:46:21 +0000 (+0100) Subject: Update management commend for importing MCM Thesaurus X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=bf7eb1d6e46bfc19d471467bc2ea2079ee5723c8;p=telemeta_mcm.git Update management commend for importing MCM Thesaurus --- diff --git a/mcm/management/commands/import_xml.py b/mcm/management/commands/import_xml.py index de7dbc4..a0f05e2 100644 --- a/mcm/management/commands/import_xml.py +++ b/mcm/management/commands/import_xml.py @@ -4,7 +4,7 @@ from django.core.management.base import BaseCommand, CommandError from ...models import Document from ...models import Notice, Disc, Video, VideoFile, BookThesis, Journal -from ...models import Article, Photo, PosterBooklet, Object +from ...models import Article, Photo, PosterBooklet, Object from ...models import Author, Keyword, Reference from ...models import Event, EventEdition, EventType, EventVenue from ...models import GeographicalClassification @@ -12,44 +12,90 @@ from ...models import GeographicalClassification #import lxml.etree.ElementTree as ET import xml.etree.ElementTree as ET import os +import logging +import tempfile + +import HTMLParser + DEBUG = False -replacements = {'auteur_affiche_(dessin)>': 'auteur_affiche_dessin>', - '[record_no]>': 'record_no>', - '<': '???<', - '&<': '?<', - '\00': '', - '\373': 'û', - 'sa lta nata': 'saltanata', - 'Gugak FM': 'Gugak FM', - '\xf5': 'ı', - '\x1e': '', - '\x02': '', - '\xf9': '?', - '\xb7': '?'} +replacements_tag = { + 'auteur_affiche_(dessin)>': 'auteur_affiche_dessin>', + 'auteur_affiche_(dessin).Documents>': 'auteur_affiche_dessin_Documents>', + '[record_no]>': 'record_no>', + '->_Intervention>': 'Auteurs_intervention>', + #'\00': '', +} + +replacements_char = { + '<': '???<', + '’': ''', + 'œ': 'œ', + '&<': '?<', + '\373': 'û', + 'sa lta nata': 'saltanata', + 'Gugak FM': 'Gugak FM', + '\xf5': 'ı', + '\x1e': '', + '\x02': '', + '\xf9': '?', + '\xb7': '?', + '\371': '', +} + def cleanup_xml(xml_file): root, ext = os.path.splitext(xml_file) - clean_xml_file = ''.join([root,'_clean', ext]) - if os.path.exists(clean_xml_file): - return clean_xml_file - line_err = None - with open(xml_file, 'U') as infile, open(clean_xml_file, 'w') as outfile: - l = 1 - for line in infile: - if l == line_err: - print repr(line) - for src, target in replacements.iteritems(): - line = line.replace(src, target) - #if line.startswith('Chant et flûte taegu'): - # print repr(line) - outfile.write(line) - l += 1 + clean_xml_file = ''.join([root, '_clean', ext]) + log_file = ''.join([root, '_clean_log.txt']) + if os.path.exists(log_file): + os.unlink(log_file) + logging.basicConfig(filename=log_file, format='%(levelname)s:%(message)s', + level=logging.DEBUG) + logging.info('Nettoyage du fichier XML %s', xml_file) + # if os.path.exists(clean_xml_file): + # return clean_xml_file + h = HTMLParser.HTMLParser() + temp_xml = tempfile.NamedTemporaryFile(delete=False) + # 1ere passe : nettoyage des tags XML + with open(xml_file, 'U') as infile: + with open(temp_xml.name, 'w') as outfile: + for line in infile: + for src, target in replacements_tag.iteritems(): + if src in line: + line = line.replace(src, target) + outfile.write(line) + + # 2nde passe : nettoyage des caractères + with open(temp_xml.name, 'U') as infile: + with open(clean_xml_file, 'w') as outfile: + lineno = 1 + for line in infile: + change_line = False + for src, target in replacements_char.iteritems(): + if src in line: + change_line = True + logging.info('Ligne : %d', lineno) + logging.info('%s -> %s', src, target) + logging.info('Ligne de texte originale: %s', line) + line = line.replace(src, target) + logging.info('Ligne de texte de remplacement : %s', line) + try: + line = h.unescape(line) + except UnicodeDecodeError as e: + print line + raise e + if change_line: + logging.info('Ligne de texte de remplacement HTML : %s', line) + outfile.write(line.encode('utf-8')) + + lineno += 1 + os.unlink(temp_xml.name) return clean_xml_file - + class Command(BaseCommand): help = 'Import items from XML' @@ -71,7 +117,6 @@ class Command(BaseCommand): erreur_date_parution = 0 erreur_date_indexation = 0 - # a-Notice spectacle # b-Disque # c-Vidéo DVD&VHS @@ -84,19 +129,17 @@ class Command(BaseCommand): # k-Pédagogique # l-Objet - DOCUMENT_CLASS = {'a-Notice spectacle': Notice, - 'b-Disque': Disc, - u'c-Vidéo DVD&VHS': Video, - u'd-Vidéo en ligne': VideoFile, - u'f-Ouvrage & Thèse': BookThesis, - 'g-Revue': Journal, - 'h-Article': Article, - 'i-Photo': Photo, - 'j-Affiche - Brochure': PosterBooklet, - 'l-Objet': Object - } - - + DOCUMENT_CLASS = {'a-Notice spectacle': Notice, + 'b-Disque': Disc, + u'c-Vidéo DVD&VHS': Video, + u'd-Vidéo en ligne': VideoFile, + u'f-Ouvrage & Thèse': BookThesis, + 'g-Revue': Journal, + 'h-Article': Article, + 'i-Photo': Photo, + 'j-Affiche - Brochure': PosterBooklet, + 'l-Objet': Object + } skip_document_types = ['e-Site Internet', 'l-Objet', u'k-Pédagogique'] @@ -104,20 +147,20 @@ class Command(BaseCommand): import HTMLParser h = HTMLParser.HTMLParser() - + for document in root.iter('Document'): - #print '------------' + # print '------------' doc_type = h.unescape(document.findtext('Type')) - #print doc_type + # print doc_type if doc_type in skip_document_types: if doc_type == 'l-Objet': document_non_traite += 1 continue - document_traite +=1 + document_traite += 1 record_no = document.findtext('record_no') - code = document.findtext('Cote') - doc_class = DOCUMENT_CLASS[doc_type] + code = document.findtext('Cote') + doc_class = DOCUMENT_CLASS[doc_type] doc, c = doc_class.objects.get_or_create(old_id=record_no, code=code) # Title @@ -125,10 +168,10 @@ class Command(BaseCommand): doc.save() # Keywords for keyword in document.findall('Mots-cles'): - keyword_obj, keyword_c = Keyword.objects.get_or_create( - name=keyword.text) - doc.keywords.add(keyword_obj) - + keyword_obj, keyword_c = Keyword.objects.get_or_create( + name=keyword.text) + doc.keywords.add(keyword_obj) + if doc_type == 'a-Notice spectacle': event_type = document.findtext('Type_Manifestation') @@ -144,11 +187,11 @@ class Command(BaseCommand): name=event_venue) else: event_venue_obj = None - + event = document.findtext('Festival_et_Manifestation') if event is not None: event_obj, c = Event.objects.get_or_create(name=event) - + edition = document.findtext('No_edition') try: event_edition_obj, c = EventEdition.objects.get_or_create( @@ -163,32 +206,29 @@ class Command(BaseCommand): doc.event_edition = event_edition_obj doc.event_type = event_type_obj doc.event_venue = event_venue_obj - - - import datetime try: release_date = datetime.datetime.strptime( - document.find('Date_de_parution').text,'%d/%m/%y').date() + document.find('Date_de_parution').text, '%d/%m/%y').date() except ValueError: - #if document.find('Date_de_parution').text == '2015/09/08': + # if document.find('Date_de_parution').text == '2015/09/08': # release_date = datetime.datetime.strptime('08/09/2015','%d/%m/%y').date() release_date = None erreur_date_parution += 1 try: - indexation_date = datetime.datetime.strptime( - document.find('Date_d_indexation').text,'%d/%m/%y').date() + indexation_date = datetime.datetime.strptime( + document.find('Date_d_indexation').text, '%d/%m/%y').date() except ValueError: indexation_date = None - erreur_date_indexation +=1 - - ## print '---------' - ## print record_no - ## print code - ## print title - ## print release_date - ## print indexation_date + erreur_date_indexation += 1 + + # print '---------' + # print record_no + # print code + # print title + # print release_date + # print indexation_date # Authors for author in document.findall('auteurs'): @@ -201,14 +241,14 @@ class Command(BaseCommand): ref_obj, ref_c = Reference.objects.get_or_create( name=ref.text) doc.references.add(ref_obj) - # GeographicalClassification + #  GeographicalClassification geo = document.findtext('Classement_Geographique') if geo is not None: - geo_obj,c = GeographicalClassification.objects.get_or_create( + geo_obj, c = GeographicalClassification.objects.get_or_create( name=geo) doc.geographic_classification = geo_obj doc.save() - + if DEBUG & (document_traite > 100): break print '-*-*--*-*-*-*-*-*-*-*' @@ -216,7 +256,3 @@ class Command(BaseCommand): print 'document_non_traité : %d' % document_non_traite print 'erreur_date_parution : %d' % erreur_date_parution print 'erreur_date_indexation : %d' % erreur_date_indexation - - - -