From: Thomas Fillon <thomas@parisson.com>
Date: Sat, 25 Feb 2017 14:46:21 +0000 (+0100)
Subject: Update management command for importing MCM Thesaurus
X-Git-Url: https://git.parisson.com/?a=commitdiff_plain;h=bf7eb1d6e46bfc19d471467bc2ea2079ee5723c8;p=telemeta_mcm.git

Update management command for importing MCM Thesaurus
---

diff --git a/mcm/management/commands/import_xml.py b/mcm/management/commands/import_xml.py
index de7dbc4..a0f05e2 100644
--- a/mcm/management/commands/import_xml.py
+++ b/mcm/management/commands/import_xml.py
@@ -4,7 +4,7 @@ from django.core.management.base import BaseCommand, CommandError
 
 from ...models import Document
 from ...models import Notice, Disc, Video, VideoFile, BookThesis, Journal
-from ...models import Article, Photo, PosterBooklet, Object 
+from ...models import Article, Photo, PosterBooklet, Object
 from ...models import Author, Keyword, Reference
 from ...models import Event, EventEdition, EventType, EventVenue
 from ...models import GeographicalClassification
@@ -12,44 +12,90 @@ from ...models import GeographicalClassification
 #import lxml.etree.ElementTree as ET
 import xml.etree.ElementTree as ET
 import os
+import logging
+import tempfile
+
+import HTMLParser
+
 
 DEBUG = False
 
-replacements = {'auteur_affiche_(dessin)>': 'auteur_affiche_dessin>',
-                '[record_no]>': 'record_no>',
-                '<': '???<',
-                '&<': '?<',
-                '\00': '',
-                '\373': 'Ã»',
-                'sa lta nata': 'saltanata',
-                'Gugak FM': 'Gugak FM',
-                '\xf5': '&#305;',
-                '\x1e': '',
-                '\x02': '',
-                '\xf9': '?',
-                '\xb7': '?'}
+replacements_tag = {
+    'auteur_affiche_(dessin)>': 'auteur_affiche_dessin>',
+    'auteur_affiche_(dessin).Documents>': 'auteur_affiche_dessin_Documents>',
+    '[record_no]>': 'record_no>',
+    '->_Intervention>': 'Auteurs_intervention>',
+    #'\00': '',
+}
+
+replacements_char = {
+    '<': '???<',
+    '&#146;': '&apos;',
+    '&#156;': '&oelig;',
+    '&<': '?<',
+    '\373': '&ucirc;',
+    'sa lta nata': 'saltanata',
+    'Gugak FM': 'Gugak FM',
+    '\xf5': '&#305;',
+    '\x1e': '',
+    '\x02': '',
+    '\xf9': '?',
+    '\xb7': '?',
+    '\371': '',
+}
+
 
 def cleanup_xml(xml_file):
     root, ext = os.path.splitext(xml_file)
-    clean_xml_file = ''.join([root,'_clean', ext])
-    if os.path.exists(clean_xml_file):
-        return clean_xml_file
-    line_err =  None 
-    with open(xml_file, 'U') as infile, open(clean_xml_file, 'w') as outfile:
-        l = 1
-        for line in infile:
-            if l == line_err:
-                print repr(line)
-            for src, target in replacements.iteritems():
-                line = line.replace(src, target)
-                #if line.startswith('Chant et fl&#251;te taegu'):
-                #    print repr(line)
-            outfile.write(line)
-            l += 1
+    clean_xml_file = ''.join([root, '_clean', ext])
+    log_file = ''.join([root, '_clean_log.txt'])
+    if os.path.exists(log_file):
+        os.unlink(log_file)
+    logging.basicConfig(filename=log_file, format='%(levelname)s:%(message)s',
+                        level=logging.DEBUG)
+    logging.info('Nettoyage du fichier XML %s', xml_file)
+    # if os.path.exists(clean_xml_file):
+    #    return clean_xml_file
+    h = HTMLParser.HTMLParser()
+    temp_xml = tempfile.NamedTemporaryFile(delete=False)
+    # 1ere passe : nettoyage des tags XML
+    with open(xml_file, 'U') as infile:
+        with open(temp_xml.name, 'w') as outfile:
+            for line in infile:
+                for src, target in replacements_tag.iteritems():
+                    if src in line:
+                        line = line.replace(src, target)
+                outfile.write(line)
+
+    # 2nde passe : nettoyage des caractères
+    with open(temp_xml.name, 'U') as infile:
+        with open(clean_xml_file, 'w') as outfile:
+            lineno = 1
+            for line in infile:
+                change_line = False
+                for src, target in replacements_char.iteritems():
+                    if src in line:
+                        change_line = True
+                        logging.info('Ligne : %d', lineno)
+                        logging.info('%s -> %s', src, target)
+                        logging.info('Ligne de texte originale: %s', line)
+                        line = line.replace(src, target)
+                        logging.info('Ligne de texte de remplacement : %s', line)
+                try:
+                    line = h.unescape(line)
+                except UnicodeDecodeError as e:
+                    print line
+                    raise e
+                if change_line:
+                    logging.info('Ligne de texte de remplacement HTML : %s', line)
+                outfile.write(line.encode('utf-8'))
+
+                lineno += 1
+    os.unlink(temp_xml.name)
 
     return clean_xml_file
 
-    
+
 class Command(BaseCommand):
     help = 'Import items from XML'
 
@@ -71,7 +117,6 @@ class Command(BaseCommand):
         erreur_date_parution = 0
         erreur_date_indexation = 0
 
-        
         # <Type>a-Notice spectacle</Type>
         # <Type>b-Disque</Type>
         # <Type>c-VidÃ©o DVD&#38;VHS</Type>
@@ -84,19 +129,17 @@ class Command(BaseCommand):
         # <Type>k-P&#233;dagogique</Type>
         # <Type>l-Objet</Type>
 
-        DOCUMENT_CLASS  = {'a-Notice spectacle': Notice,
-                           'b-Disque': Disc,
-                           u'c-VidÃ©o DVD&VHS': Video,
-                           u'd-VidÃ©o en ligne': VideoFile,
-                           u'f-Ouvrage & ThÃ¨se': BookThesis,
-                           'g-Revue': Journal,
-                           'h-Article': Article,
-                           'i-Photo': Photo,
-                           'j-Affiche - Brochure': PosterBooklet,
-                           'l-Objet': Object
-                           }
-
-
+        DOCUMENT_CLASS = {'a-Notice spectacle': Notice,
+                          'b-Disque': Disc,
+                          u'c-VidÃ©o DVD&VHS': Video,
+                          u'd-VidÃ©o en ligne': VideoFile,
+                          u'f-Ouvrage & ThÃ¨se': BookThesis,
+                          'g-Revue': Journal,
+                          'h-Article': Article,
+                          'i-Photo': Photo,
+                          'j-Affiche - Brochure': PosterBooklet,
+                          'l-Objet': Object
+                          }
 
         skip_document_types = ['e-Site Internet', 'l-Objet', u'k-PÃ©dagogique']
 
@@ -104,20 +147,20 @@ class Command(BaseCommand):
 
         import HTMLParser
         h = HTMLParser.HTMLParser()
-                
+
         for document in root.iter('Document'):
-            #print '------------'
+            # print '------------'
             doc_type = h.unescape(document.findtext('Type'))
-            #print doc_type
+            # print doc_type
             if doc_type in skip_document_types:
                 if doc_type == 'l-Objet':
                     document_non_traite += 1
                 continue
 
-            document_traite +=1
+            document_traite += 1
             record_no = document.findtext('record_no')
-            code =  document.findtext('Cote')
-            doc_class = DOCUMENT_CLASS[doc_type]     
+            code = document.findtext('Cote')
+            doc_class = DOCUMENT_CLASS[doc_type]
             doc, c = doc_class.objects.get_or_create(old_id=record_no,
                                                      code=code)
             # Title
@@ -125,10 +168,10 @@ class Command(BaseCommand):
             doc.save()
             # Keywords
             for keyword in document.findall('Mots-cles'):
-                    keyword_obj, keyword_c = Keyword.objects.get_or_create(
-                            name=keyword.text)
-                    doc.keywords.add(keyword_obj)
-            
+                keyword_obj, keyword_c = Keyword.objects.get_or_create(
+                    name=keyword.text)
+                doc.keywords.add(keyword_obj)
+
             if doc_type == 'a-Notice spectacle':
 
                 event_type = document.findtext('Type_Manifestation')
@@ -144,11 +187,11 @@ class Command(BaseCommand):
                         name=event_venue)
                 else:
                     event_venue_obj = None
-                    
+
                 event = document.findtext('Festival_et_Manifestation')
                 if event is not None:
                     event_obj, c = Event.objects.get_or_create(name=event)
-                    
+
                     edition = document.findtext('No_edition')
                     try:
                         event_edition_obj, c = EventEdition.objects.get_or_create(
@@ -163,32 +206,29 @@ class Command(BaseCommand):
                 doc.event_edition = event_edition_obj
                 doc.event_type = event_type_obj
                 doc.event_venue = event_venue_obj
- 
 
-                
- 
                 import datetime
                 try:
                     release_date = datetime.datetime.strptime(
-                        document.find('Date_de_parution').text,'%d/%m/%y').date()
+                        document.find('Date_de_parution').text, '%d/%m/%y').date()
                 except ValueError:
-                    #if document.find('Date_de_parution').text == '2015/09/08':
+                    # if document.find('Date_de_parution').text == '2015/09/08':
                     #    release_date = datetime.datetime.strptime('08/09/2015','%d/%m/%y').date()
                     release_date = None
                     erreur_date_parution += 1
                 try:
-                   indexation_date = datetime.datetime.strptime(
-                       document.find('Date_d_indexation').text,'%d/%m/%y').date()
+                    indexation_date = datetime.datetime.strptime(
+                        document.find('Date_d_indexation').text, '%d/%m/%y').date()
                 except ValueError:
                     indexation_date = None
-                    erreur_date_indexation +=1 
-                
-                ## print '---------'
-                ## print record_no
-                ## print code
-                ## print title
-                ## print release_date
-                ## print indexation_date
+                    erreur_date_indexation += 1
+
+                # print '---------'
+                # print record_no
+                # print code
+                # print title
+                # print release_date
+                # print indexation_date
 
                 # Authors
                 for author in document.findall('auteurs'):
@@ -201,14 +241,14 @@ class Command(BaseCommand):
                     ref_obj, ref_c = Reference.objects.get_or_create(
                         name=ref.text)
                     doc.references.add(ref_obj)
-                #Â GeographicalClassification
+                # GeographicalClassification
                 geo = document.findtext('Classement_Geographique')
                 if geo is not None:
-                    geo_obj,c = GeographicalClassification.objects.get_or_create(
+                    geo_obj, c = GeographicalClassification.objects.get_or_create(
                         name=geo)
                     doc.geographic_classification = geo_obj
                 doc.save()
-            
+
             if DEBUG & (document_traite > 100):
                 break
         print '-*-*--*-*-*-*-*-*-*-*'
@@ -216,7 +256,3 @@ class Command(BaseCommand):
         print 'document_non_traitÃ© : %d' % document_non_traite
         print 'erreur_date_parution : %d' % erreur_date_parution
         print 'erreur_date_indexation : %d' % erreur_date_indexation
-            
-
-
-