# coding: utf-8
# rougail/creole/lint/normalize.py
# 77 lines - 2.5 KiB - Python
import re
import unicodedata
from entities import entities
# ______________________________________________________________________________
ENCODING = 'utf-8'


def strip_accents(string):
    """Return ``string`` with its diacritics removed, reduced to ASCII.

    The text is decomposed with Unicode NFKD normalization, then encoded
    to ASCII while ignoring every character that cannot be encoded (the
    combining accents produced by the decomposition included).

    :param string: text to clean up; either a byte string encoded with
        ``ENCODING`` or an already decoded text string.
    :return: the ASCII-only text as a native ``str``.
    """
    # Only byte strings need decoding; decoded text is used as is (the
    # previous ``unicode(string, ENCODING)`` raised TypeError on it).
    if isinstance(string, bytes):
        string = string.decode(ENCODING)
    decomposed = unicodedata.normalize('NFKD', string)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    # Keep returning the native ``str`` type: a byte string on Python 2,
    # text on Python 3 (where ``str`` and ``bytes`` are distinct types).
    if str is bytes:
        return ascii_bytes
    return ascii_bytes.decode('ASCII')
def normalize_entities():
    """
    Build the comparison form of every known entity.

    Each item of ``entities`` goes through ``strip_accents`` and is then
    lower-cased; the order of ``entities`` is kept.

    :return: list of normalized entities
    """
    return [strip_accents(raw_entity).lower() for raw_entity in entities]


# Computed once, when the module is imported.
NORM_ENTITIES = normalize_entities()
# ______________________________________________________________________________
def parse_string(text):
    """Extract the words found in ``text``.

    A word is a run of ASCII letters, underscores and the accented
    letters listed in the pattern; any other character acts as a
    separator. Accents are kept in the returned words.

    :param text: text to split; either a byte string (decoded with
        ``ENCODING``) or an already decoded text string.
    :return: list of words, of the same string type as ``text``.
    """
    is_bytes = isinstance(text, bytes)
    if is_bytes:
        # Match on decoded text: in a byte-string pattern the character
        # class holds the individual UTF-8 bytes of the accented letters,
        # so one half of a multi-byte character could end up in a word.
        # Undecodable bytes become U+FFFD, which is not a word character.
        text = text.decode(ENCODING, 'replace')
    words = re.findall(u'([a-zA-Zéèàùêôëö_]+)', text)
    if is_bytes:
        # Hand back the same string type the caller passed in.
        words = [word.encode(ENCODING) for word in words]
    return words
def is_in_entities(text):
    """
    Look ``text`` up, lower-cased, in ``NORM_ENTITIES``.

    :return: position of the first match in ``NORM_ENTITIES``, or
        ``None`` when there is no match
    """
    try:
        return NORM_ENTITIES.index(text.lower())
    except ValueError:
        # list.index() signals "not found" with ValueError.
        return None
def _to_native_str(value):
    """Return ``value`` as a native ``str``, converting with ``ENCODING``."""
    if isinstance(value, str):
        return value
    if str is bytes:
        # Python 2: decoded text -> byte string.
        return value.encode(ENCODING)
    # Python 3: byte string -> text.
    return value.decode(ENCODING)


def is_correct(libelle, name, family=False):
    """Check the wording of a label and report what looks wrong.

    Two checks are made:

    * the label must not start with a lower-case letter (labels starting
      with ``ejabberd`` or ``phpMyAdmin`` are exempt);
    * every word of the label that matches a known entity once
      lower-cased must be written exactly as in ``entities``. This check
      is skipped for labels containing one of the known false-positive
      markers listed below.

    :param libelle: label to check; ``None`` and the empty string are
        accepted and produce no message.
    :param name: name identifying the label in the messages.
    :param family: when true, entity messages are prefixed with
        ``famille [name]`` instead of ``%%name``.
    :return: list of messages, empty when nothing was found.
    """
    if libelle is None:
        return []
    libelle = _to_native_str(libelle)
    if not libelle:
        return []

    ret = []
    # FIXME: known false positives
    if libelle[0].islower() and \
            not libelle.startswith(('ejabberd', 'phpMyAdmin')):
        ret.append('%%%%%s : phrase sans majuscule' % name)

    # FIXME: known false positives
    # The test only depends on the label, so it is done once here instead
    # of once per mismatching word.
    false_positive_markers = (
        'ipsec.conf',
        'test-rvp',
        'bareos-',
        'bacula-',
        '/var/log/zephir',
        'exemple : eolebase',
    )
    if any(marker in libelle for marker in false_positive_markers):
        return ret

    for text in parse_string(libelle):
        text_index = is_in_entities(text)
        if text_index is None:
            continue
        ent = _to_native_str(entities[text_index])
        if str(text) == ent:
            # Already written as in ``entities``.
            continue
        if family:
            ret.append('famille [%s] : %s => %s' % (str(name), text, ent))
        else:
            ret.append('%%%%%s : %s => %s' % (str(name), text, ent))
    return ret
# ______________________________________________________________________________