SPPAS 4.20

Module sppas.src.resources

Class sppasWordStrain

Description

A basic form of lemmatization.

Constructor

Create a WordStrain instance.

Parameters
  • filename: (str) 2 or 3 columns file with word/freq/wordstrain
View Source
def __init__(self, filename=None):
    """Create a WordStrain instance.

    The parent constructor is called without a dictionary file and with
    ``nodump=True``; the entries are then read by ``load()``.

    :param filename: (str) 2 or 3 columns file with word/freq/wordstrain.
        If None, nothing is loaded.

    """
    # The parent is not given the file: loading is delegated to load().
    super(sppasWordStrain, self).__init__(dict_filename=None, nodump=True)
    self.load(filename)

Public functions

load

Load word substitutions from a file.

Replace the existing substitutions.

Parameters
  • filename: (str) 2 or 3 columns file with word/freq/replacement
View Source
def load(self, filename):
    """Load word substitutions from a file.

    The file layout is detected from its first non-blank line: with
    fewer than 3 whitespace-separated columns the file is read with
    ``load_from_ascii()``, otherwise it is read as a
    word/freq/replacement file.

    NOTE(review): whether already loaded substitutions are cleared
    depends on the loaders called here -- to be confirmed.

    :param filename: (str) 2 or 3 columns file with word/freq/replacement.
        If None, nothing is done.
    :raises: FileUnicodeError if the file can't be decoded with
        the encoding sg.__encoding__

    """
    if filename is None:
        return

    # Blank lines are ignored by the loaders, so they must not be used
    # to guess the number of columns either.
    first_line = ""
    # The context manager closes the file: no explicit close() is needed.
    with codecs.open(filename, 'r', sg.__encoding__) as fd:
        try:
            for line in fd:
                if line.strip():
                    first_line = line
                    break
        except UnicodeDecodeError:
            raise FileUnicodeError(filename=filename)

    # An empty (or all-blank) file falls into the 2-columns case.
    if len(first_line.split()) < 3:
        self.load_from_ascii(filename)
    else:
        self.__load_with_freq(filename)

Protected functions

__load_with_freq

Load a replacement dictionary from a 3-columns ascii file.

Parameters
  • filename: (str) Replacement dictionary file name
View Source
def __load_with_freq(self, filename):
    """Load a replacement dictionary from a 3-columns ascii file.

    Each line is split on whitespace into: the word (stored
    lower-cased), an integer frequency, and the replacement made of all
    the remaining columns joined with ``sppasDictRepl.REPLACE_SEPARATOR``.
    Lines with fewer than 2 columns are ignored; a line with exactly
    2 columns yields an empty replacement.

    When a word occurs several times in the file, only the replacement
    with the strictly highest frequency is kept: on equal frequencies
    the first occurrence wins.

    :param filename: (str) Replacement dictionary file name
    :raises: FileUnicodeError if the file can't be decoded with
        the encoding sg.__encoding__
    :raises: ValueError if the second column of a line is not an integer

    """
    # The context manager closes the file: no explicit close() is needed.
    with codecs.open(filename, 'r', sg.__encoding__) as fd:
        try:
            lines = fd.readlines()
        except UnicodeDecodeError:
            raise FileUnicodeError(filename=filename)

    self.__filename = filename

    # Highest frequency met so far for each word of this file.
    frequency = {}
    for line in lines:
        # A single split is enough: it drops the surrounding whitespace
        # and returns an empty list for a blank line.
        columns = line.split()
        if len(columns) < 2:
            continue

        key = columns[0].lower()
        freq = int(columns[1])
        value = sppasDictRepl.REPLACE_SEPARATOR.join(columns[2:])

        already_seen = key in frequency
        if already_seen and freq <= frequency[key]:
            # A replacement at least as frequent is already stored.
            continue

        frequency[key] = freq
        if already_seen:
            # Discard the less frequent replacement before adding the new one.
            self.pop(key)
        self.add(key, value)