tokenize package
This module focuses on the tokenization of both Kurmanji and Sorani dialects of Kurdish with the following functions:

- word_tokenize: tokenization of texts into tokens (both multi-word expressions and single-word tokens)
- mwe_tokenize: tokenization of texts by only taking compound forms into account
- sent_tokenize: tokenization of texts into sentences
The module is based on the Kurdish tokenization project.
Examples:
>>> from klpt.tokenize import Tokenize
>>> tokenizer = Tokenize("Kurmanji", "Latin")
>>> tokenizer.word_tokenize("ji bo fortê xwe avêtin")
['▁ji▁', 'bo', '▁▁fortê‒xwe‒avêtin▁▁']
>>> tokenizer.mwe_tokenize("bi serokê hukûmeta herêma Kurdistanê Prof. Salih re saz kir.")
'bi serokê hukûmeta herêma Kurdistanê Prof . Salih re saz kir .'
>>> tokenizer_ckb = Tokenize("Sorani", "Arabic")
>>> tokenizer_ckb.word_tokenize("بە هەموو هەمووانەوە ڕێک کەوتن")
['▁بە▁', '▁هەموو▁', 'هەمووانەوە', '▁▁ڕێک‒کەوتن▁▁']
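sent_tokenize can be used in the same way; a minimal sketch, assuming "." is among the sentence-boundary marks configured for Kurmanji in the Latin script:
>>> tokenizer.sent_tokenize("Ew hat. Em çûn.")
['Ew hat.', 'Em çûn.']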
mwe_tokenize(self, sentence, separator='▁▁', in_separator='‒', punct_marked=False, keep_form=False)
Multi-word expression tokenization
Parameters:

Name | Type | Description | Default |
---|---|---|---|
sentence | str | sentence to be split by multi-word expressions | required |
separator | str | the token used to mark a multi-word expression. By default, two ▁ characters (▁▁) are used for this purpose. | '▁▁' |
in_separator | str | the token used to delimit the composing parts of a multi-word expression. By default, a dash (‒) is used for this purpose. | '‒' |
punct_marked | bool | if set to True, punctuation marks are assumed to be already surrounded by spaces and no extra spacing is added | False |
keep_form | bool | if set to True, the original form of the multi-word expression is returned as provided in the input; if set to False, the lemma form is used instead, with its parts delimited by a dash (‒), as in "dab‒û‒nerît" | False |
Returns:

Type | Description |
---|---|
str | sentence with multi-word expressions marked by the separator |
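A usage sketch of keep_form, reusing the multi-word expression from the examples above and assuming the same lexicon entries:
>>> tokenizer.mwe_tokenize("ji bo fortê xwe avêtin")
'ji bo ▁▁fortê‒xwe‒avêtin▁▁'
>>> tokenizer.mwe_tokenize("ji bo fortê xwe avêtin", keep_form=True)
'ji bo ▁▁fortê xwe avêtin▁▁'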
Source code in klpt/tokenize.py
def mwe_tokenize(self, sentence, separator="▁▁", in_separator="‒", punct_marked=False, keep_form=False):
    """
    Multi-word expression tokenization

    Args:
        sentence (str): sentence to be split by multi-word expressions
        separator (str): the token used to mark a multi-word expression. By default, two ▁ characters (▁▁) are used for this purpose.
        in_separator (str): the token used to delimit the composing parts of a multi-word expression. By default, a dash (‒) is used for this purpose.
        punct_marked (bool): if set to True, punctuation marks are assumed to be already surrounded by spaces
        keep_form (bool): if set to True, the original form of the multi-word expression is returned as provided in the input; if set to False, the lemma form is used instead, with its parts delimited by a dash (‒), as in "dab‒û‒nerît"

    Returns:
        str: sentence with multi-word expressions marked by the separator
    """
    sentence = " " + sentence + " "
    if not punct_marked:
        # find punctuation marks and add a space around them
        for punct in self.tokenize_map["word_tokenize"][self.dialect][self.script]["punctuation"]:
            if punct in sentence:
                sentence = sentence.replace(punct, " " + punct + " ")

    # look for compound words and delimit them by the double separator
    for compound_lemma in self.mwe_lexicon:
        compound_lemma_context = " " + compound_lemma + " "
        if compound_lemma_context in sentence:
            if keep_form:
                sentence = sentence.replace(compound_lemma_context, " ▁▁" + compound_lemma + "▁▁ ")
            else:
                sentence = sentence.replace(compound_lemma_context, " ▁▁" + compound_lemma.replace("-", in_separator) + "▁▁ ")

        # check the possible word forms available for each compound lemma in the lex files, too
        # Note: compound forms don't have any hyphen or separator in the lex files
        for compound_form in self.mwe_lexicon[compound_lemma]["token_forms"]:
            compound_form_context = " " + compound_form + " "
            if compound_form_context in sentence:
                if keep_form:
                    sentence = sentence.replace(compound_form_context, " ▁▁" + compound_form + "▁▁ ")
                else:
                    sentence = sentence.replace(compound_form_context, " ▁▁" + compound_lemma.replace("-", in_separator) + "▁▁ ")

    # collapse double spaces introduced by the replacements, then apply the requested separator
    return sentence.replace("  ", " ").replace("▁▁", separator).strip()
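The separators are configurable; a sketch under the same lexicon assumptions as above:
>>> tokenizer.mwe_tokenize("ji bo fortê xwe avêtin", separator="<mwe>", in_separator="+")
'ji bo <mwe>fortê+xwe+avêtin<mwe>'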
sent_tokenize(self, text)
Sentence tokenizer
Parameters:

Name | Type | Description | Default |
---|---|---|---|
text | str | input text to be tokenized by sentences | required |
Returns:

Type | Description |
---|---|
list | a list of sentences |
Source code in klpt/tokenize.py
def sent_tokenize(self, text):
    """Sentence tokenizer

    Args:
        text (str): input text to be tokenized by sentences

    Returns:
        list: a list of sentences
    """
    text = " " + text + " "
    text = text.replace("\n", " ")
    # protect periods that do not end a sentence by replacing them with <prd>
    text = re.sub(self.prefixes, "\\1<prd>", text)
    text = re.sub(self.websites, "<prd>\\1", text)
    text = re.sub(r"\s" + self.alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(self.acronyms + " " + self.starters, "\\1<stop> \\2", text)
    text = re.sub(self.alphabets + "[.]" + self.alphabets + "[.]" + self.alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(self.alphabets + "[.]" + self.alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + self.suffixes + "[.] " + self.starters, " \\1<stop> \\2", text)
    text = re.sub(" " + self.suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(self.digits + "[.]" + self.digits, "\\1<prd>\\2", text)
    # mark genuine sentence boundaries with <stop>
    for punct in self.tokenize_map["sent_tokenize"][self.dialect][self.script]["punct_boundary"]:
        text = text.replace(punct, punct + "<stop>")
    # restore the protected periods, then split on the boundary marker
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences if len(s.strip())]
    return sentences
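The <prd> placeholder protects periods that do not end a sentence, e.g. in abbreviations. A minimal sketch, assuming "Prof" is listed among the abbreviation prefixes and "." among the boundary marks:
>>> tokenizer.sent_tokenize("Serok Prof. Salih hat. Em çûn.")
['Serok Prof. Salih hat.', 'Em çûn.']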
word_tokenize(self, sentence, separator='▁', mwe_separator='▁▁', keep_form=False)
Word tokenizer
Parameters:

Name | Type | Description | Default |
---|---|---|---|
sentence | str | sentence or text to be tokenized | required |
separator | str | the token used to mark single-word tokens found in the lexicon | '▁' |
mwe_separator | str | the token used to mark multi-word expressions | '▁▁' |
keep_form | bool | passed to mwe_tokenize; if set to True, multi-word expressions keep their original surface form | False |
Returns:

Type | Description |
---|---|
list | a list of words |
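Setting both separators to empty strings yields plain tokens without markers (a usage sketch, reusing the module example above):
>>> tokenizer.word_tokenize("ji bo fortê xwe avêtin", separator="", mwe_separator="")
['ji', 'bo', 'fortê‒xwe‒avêtin']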
Source code in klpt/tokenize.py
def word_tokenize(self, sentence, separator="▁", mwe_separator="▁▁", keep_form=False):
    """Word tokenizer

    Args:
        sentence (str): sentence or text to be tokenized
        separator (str): the token used to mark single-word tokens found in the lexicon
        mwe_separator (str): the token used to mark multi-word expressions
        keep_form (bool): passed to mwe_tokenize; if set to True, multi-word expressions keep their original surface form

    Returns:
        list: a list of words
    """
    # find multi-word expressions in the sentence
    sentence = self.mwe_tokenize(sentence, keep_form=keep_form)

    # find punctuation marks and add a space around them
    for punct in self.tokenize_map["word_tokenize"][self.dialect][self.script]["punctuation"]:
        if punct in sentence:
            sentence = sentence.replace(punct, " " + punct + " ")

    tokens = list()
    # split the sentence by space and look for identifiable tokens
    for word in sentence.strip().split():
        if "▁▁" in word:
            # the word was previously detected as a compound word
            tokens.append(word)
        else:
            if word in self.lexicon:
                # the word exists in the lexicon
                tokens.append("▁" + word + "▁")
            else:
                # the word is neither a lemma nor a compound:
                # morphological analysis by identifying affixes and clitics
                token_identified = False
                for preposition in self.morphemes["prefixes"]:
                    if word.startswith(preposition) and len(word.split(preposition, 1)) > 1:
                        if word.split(preposition, 1)[1] in self.lexicon:
                            word = "▁".join(["", self.morphemes["prefixes"][preposition], word.split(preposition, 1)[1], ""])
                            token_identified = True
                            break
                        elif self.mwe_tokenize(word.split(preposition, 1)[1], keep_form=keep_form) != word.split(preposition, 1)[1]:
                            word = "▁" + self.morphemes["prefixes"][preposition] + self.mwe_tokenize(word.split(preposition, 1)[1], keep_form=keep_form)
                            token_identified = True
                            break
                if not token_identified:
                    for postposition in self.morphemes["suffixes"]:
                        if word.endswith(postposition) and len(word.rpartition(postposition)[0]):
                            if word.rpartition(postposition)[0] in self.lexicon:
                                word = "▁" + word.rpartition(postposition)[0] + "▁" + self.morphemes["suffixes"][postposition]
                                break
                            elif self.mwe_tokenize(word.rpartition(postposition)[0], keep_form=keep_form) != word.rpartition(postposition)[0]:
                                word = ("▁" + self.mwe_tokenize(word.rpartition(postposition)[0], keep_form=keep_form) + "▁" + self.morphemes["suffixes"][postposition] + "▁").replace("▁▁▁", "▁▁")
                                break
                tokens.append(word)

    return " ".join(tokens).replace("▁▁", mwe_separator).replace("▁", separator).split()