# blob: 3eb56c516e04e108642b069ede81d141f7c705b0 [file] [log] [blame]
# -*- coding: utf-8 -*-
import csv
from log_util import LogUtil
from singleton import Singleton
@Singleton
class LabelUtil:
    """Translate between label characters and their integer indices.

    The character/index table is read from a two-column CSV file by
    ``load_unicode_set``.  Every conversion method and accessor relies on
    the attributes that call creates (``byChar``, ``byIndex``, ``count``
    and ``unicodeFilePath``).
    """

    _log = None

    # dataPath
    def __init__(self):
        """Obtain the logger from ``LogUtil`` and log the initialisation."""
        self._log = LogUtil().getlogger()
        self._log.debug("LabelUtil init")

    def _abort(self, message):
        """Log ``message`` as an error and terminate with exit status -1."""
        self._log.error(message)
        # Raise SystemExit directly instead of calling the ``exit`` builtin:
        # ``exit`` is installed by the ``site`` module and is unavailable
        # under ``python -S`` or in frozen/embedded interpreters.
        raise SystemExit(-1)

    def load_unicode_set(self, unicodeFilePath):
        """Load the character/index table from a comma-separated file.

        Column 0 of each row is the character and column 1 its integer
        index.  Blank lines are ignored.  On success the instance
        attributes ``byChar`` (character -> index), ``byIndex``
        (index -> character), ``count`` (number of rows read) and
        ``unicodeFilePath`` are replaced.

        Args:
            unicodeFilePath: path of the CSV file to read.

        Raises:
            IOError/OSError: if the file cannot be opened.
            IndexError: if a non-empty row has fewer than two columns.
            ValueError: if column 1 of a row is not an integer.
        """
        by_char = {}
        by_index = {}
        row_count = 0
        with open(unicodeFilePath) as data_file:
            for row in csv.reader(data_file, delimiter=','):
                if not row:
                    # csv.reader yields [] for a blank line (for example a
                    # trailing newline); there is nothing to register.
                    continue
                char = row[0]
                index = int(row[1])
                by_char[char] = index
                by_index[index] = char
                row_count += 1

        # Publish the table only after the whole file has been parsed, so a
        # failed load never leaves a half-filled mapping on the instance.
        self.byChar = by_char
        self.byIndex = by_index
        self.count = row_count
        self.unicodeFilePath = unicodeFilePath

    def to_unicode(self, src, index):
        """Return the code point at ``src[index]`` and the next index.

        Args:
            src: sequence of single characters (typically a string).
            index: position of the character to decode.

        Returns:
            A ``(code_point, next_index)`` tuple; every character occupies
            exactly one position, so ``next_index`` is ``index + 1``.
        """
        # 1 byte
        return ord(src[index]), index + 1

    def convert_word_to_grapheme(self, label):
        """Split ``label`` into its individual characters.

        Returns:
            A ``(characters, joined)`` tuple, where ``characters`` is the
            list of elements of ``label`` and ``joined`` is their
            concatenation.
        """
        graphemes = []
        index = 0
        while index < len(label):
            # to_unicode decides how far to advance; the code point itself
            # is not needed here.
            _, next_index = self.to_unicode(label, index)
            graphemes.append(label[index])
            index = next_index
        return graphemes, "".join(graphemes)

    def convert_word_to_num(self, word):
        """Convert ``word`` to a tuple of character indices.

        Empty-string elements are skipped.  If the table has not been
        loaded, or a character is missing from it, the error is logged and
        the process is terminated via ``SystemExit(-1)``.
        """
        try:
            label_list, _ = self.convert_word_to_grapheme(word)
            # tuple typecast: read only, faster
            return tuple(
                int(self.byChar[char]) for char in label_list if char != ""
            )
        except AttributeError:
            self._abort("unicodeSet is not loaded")
        except KeyError as err:
            self._abort("unicodeSet Key not found: %s" % err)

    def convert_bi_graphemes_to_num(self, word):
        """Convert an iterable of graphemes to a tuple of indices.

        Unlike ``convert_word_to_num`` this looks up each element of
        ``word`` as-is and lets lookup errors propagate to the caller.

        Raises:
            KeyError: if an element is not present in the table.
            AttributeError: if the table has not been loaded.
        """
        # tuple typecast: read only, faster
        return tuple(int(self.byChar[char]) for char in word if char != "")

    def convert_num_to_word(self, num_list):
        """Convert an iterable of indices back to the string they encode.

        If the table has not been loaded, or an index is missing from it,
        the error is logged and the process is terminated via
        ``SystemExit(-1)``.
        """
        try:
            return ''.join(self.byIndex[num] for num in num_list)
        except AttributeError:
            self._abort("unicodeSet is not loaded")
        except KeyError as err:
            self._abort("unicodeSet Key not found: %s" % err)

    def get_count(self):
        """Return the number of rows read by ``load_unicode_set``.

        Logs an error and terminates via ``SystemExit(-1)`` if the table
        has not been loaded.
        """
        try:
            return self.count
        except AttributeError:
            self._abort("unicodeSet is not loaded")

    def get_unicode_file_path(self):
        """Return the path most recently given to ``load_unicode_set``.

        Logs an error and terminates via ``SystemExit(-1)`` if the table
        has not been loaded.
        """
        try:
            return self.unicodeFilePath
        except AttributeError:
            self._abort("unicodeSet is not loaded")

    def get_blank_index(self):
        """Return the index registered for the "-" character.

        Raises:
            KeyError: if "-" is not present in the table.
            AttributeError: if the table has not been loaded.
        """
        return self.byChar["-"]

    def get_space_index(self):
        """Return the index registered for the "$" character.

        Raises:
            KeyError: if "$" is not present in the table.
            AttributeError: if the table has not been loaded.
        """
        return self.byChar["$"]