From 5fe645cc0a1b84a70c40c856364006ef73992655 Mon Sep 17 00:00:00 2001
From: Emilie Zawadzki
Date: Mon, 14 Oct 2019 13:18:54 +0200
Subject: [PATCH] Latin1 to UTF8 converter and tester script. WIP

---
Review note: this patch was edited during review (see the comments in the
added files). The blob ids on the "index" lines below were not regenerated;
re-run git format-patch after applying if exact ids are needed.

 misc/sql/latin1_to_utf8.py        |  49 ++++++++++++
 misc/sql/latin1_to_utf8_tester.py | 118 ++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+)
 create mode 100755 misc/sql/latin1_to_utf8.py
 create mode 100755 misc/sql/latin1_to_utf8_tester.py

diff --git a/misc/sql/latin1_to_utf8.py b/misc/sql/latin1_to_utf8.py
new file mode 100755
index 0000000..97f1bc1
--- /dev/null
+++ b/misc/sql/latin1_to_utf8.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3.5
+"""Convert the MariaDB dump at ``srcfile`` to UTF-8, in place.
+
+The current encoding is guessed with chardet, the dump is rewritten as UTF-8
+and the latin1 table charset declarations are switched to utf8mb4.
+"""
+
+import os
+import sys
+from chardet import detect
+from pyutil import filereplace
+srcfile = "./var/backup/mariadb.dump"
+trgfile = "./var/backup/mariadb2.dump"
+
+
+def get_encoding_type(file):
+    """Return the encoding name chardet guesses for *file*, or None."""
+    with open(file, 'rb') as f:
+        rawdata = f.read()
+    return detect(rawdata)['encoding']
+
+
+def abort(message):
+    """Delete the partially written target file and exit with *message*."""
+    if os.path.exists(trgfile):
+        os.remove(trgfile)
+    sys.exit(message)
+
+
+from_codec = get_encoding_type(srcfile)
+if from_codec is None:
+    # open(..., encoding=None) would silently use the locale default codec.
+    sys.exit('Could not detect the encoding of ' + srcfile)
+
+try:
+    with open(srcfile, 'r', encoding=from_codec) as f, \
+            open(trgfile, 'w', encoding='utf-8') as e:
+        text = f.read()  # for small files, for big use chunks
+        e.write(text)
+except UnicodeDecodeError:
+    abort('Decode Error')
+except UnicodeEncodeError:
+    abort('Encode Error')
+
+# Swap the converted dump into place; os.replace overwrites the old file.
+os.replace(trgfile, srcfile)
+
+# replace charset latin1 to utf8
+filereplace(srcfile, "CHARSET=latin1", "CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci")
diff --git a/misc/sql/latin1_to_utf8_tester.py b/misc/sql/latin1_to_utf8_tester.py
new file mode 100755
index 0000000..6aa629d
--- /dev/null
+++ b/misc/sql/latin1_to_utf8_tester.py
@@ -0,0 +1,118 @@
+#!/usr/bin/python
+"""Compare pages served by the production site and a local dev instance.
+
+Every URL in ``analyse_urls`` is fetched from both hosts with curl, sending
+the session cookies defined below, and the two responses are diffed.
+"""
+
+import pandas as pd
+import requests
+from subprocess import check_output, CalledProcessError
+domain_prod = "http://brahms.ircam.fr"
+domain_dev = "http://localhost:9030"
+session_option = "-H 'Cookie: sessionid={0}'"
+# NOTE(review): session ids are credentials; consider loading them from the
+# environment rather than committing them.
+session_prod = "gd9zgu68wo3lptwfhqqkawo59w0486rz"
+session_dev = "qh25niyvxetuouxwmafdpvxlc8eldp5u"
+
+analyse_urls = [
+    # analyse - published
+    "/analyses/dialogue/",
+    "/analyses/metallics/",
+    "/analyses/noanoa/",
+    "/analyses/traiettoria/",
+    "/analyses/Mortuos/",
+    "/analyses/Stria/",
+    "/analyses/Prologue/",
+    # analyse - draft
+    "/analyses/Etymo/",
+    "/analyses/EnTrance/",
+    "/analyses/test/",
+    "/analyses/test2/",
+    # composer
+    "/witold-lutoslawski",
+    "/wlodzimierz-kotonski",
+    "/omer-hulusier",
+    "/necil-kazim-akses",
+    "/wenjing-guo",
+    "/rene-alix",
+    "/axel-borup-jrgensen",
+    "/per-nrgard",
+    "/pierre-boulez",
+    # events_event
+    "/admin/events/event/13/change/",
+    "/admin/events/event/23/change/",
+    # events_hall
+    "/admin/events/hall/5/change/",
+    # events_manifestation
+    "/admin/events/manifestation/3/change/",
+    # repertoire_analysis_definitions
+    "/admin/repertoire/definition/32/change/",
+    # robots_rule
+    "/admin/robots/rule/1/change/",
+    # robots_url
+    "/admin/robots/url/3/change/",
+    # utils_citysidney
+    "/admin/utils/citysidney/3/change/",
+    # utils_corporatebody
+    "/admin/utils/corporatebody/127/change/",
+    # utils_countrysidney
+    "/admin/utils/countrysidney/230/change/",
+    # utils_equipmentbrand
+    "/admin/utils/equipmentbrand/127/change/",
+    # utils_equipmentcategory
+    "/admin/utils/equipmentcategory/4/change/",
+    # utils_equipmentreference
+    "/admin/utils/equipmentreference/422/change/",
+    # utils_error
+    "/admin/utils/error/1703/change/",
+    # utils_lang
+    "/admin/utils/lang/1/change/",
+    # utils_naturalperson
+    "/admin/utils/naturalperson/6791/change/",
+    "/admin/utils/naturalperson/6806/change/",
+    "/admin/utils/naturalperson/6892/change/",
+    # utils_personfunction
+    "/admin/utils/personfunction/12/change/",
+    # validation_fichedevalidation
+    "/admin/validation/fichedevalidation/74/change/",
+    "/admin/validation/fichedevalidation/63/change/",
+    # works_electronic
+    "/admin/works/electronic/11/change/",
+    "/admin/works/electronic/5/change/",
+    # works_filetype
+    "/admin/works/filetype/20/change/",
+    # works_version
+    "/admin/works/version/1345/change/",
+    "/admin/works/version/990/change/",
+    # works_versionfile
+    "/admin/works/versionfile/2665/change/",
+    "/admin/works/versionfile/9250/change/",
+    # works_worksidney
+    "/admin/works/worksidney/6970/change/",
+    "/admin/works/worksidney/18566/change/",
+    "/admin/works/worksidney/7549/change/",
+    "/admin/works/worksidney/25673/change/",
+    "/admin/works/worksidney/10575/change/",
+]
+
+
+def get_curl_command(domain, url, session):
+    """Return a curl command fetching domain + url with the session cookie."""
+    return "curl -s '{0}' {1}".format(domain + url, session_option.format(session))
+
+
+for url in analyse_urls:
+    print("==========================================================================")
+    print("url ", url)
+    print("-----------------------------------------")
+    try:
+        curl_prod = get_curl_command(domain_prod, url, session_prod)
+        curl_dev = get_curl_command(domain_dev, url, session_dev)
+        # <(...) process substitution is a bash feature, hence executable.
+        check_output("diff <("+curl_prod+") <("+ curl_dev + ")",
+                     shell=True, executable='/bin/bash', universal_newlines=True)
+    except CalledProcessError as e:
+        # diff exits non-zero when the two responses differ (or on error).
+        print(e.output, e.returncode)
-- 
2.39.5