To perform the 2nd task, you need to:

  • install NLTK (Natural Language Toolkit)
  • use the command "pip install nltk"
  • if the program 'pip' is not installed, install it first using 'sudo apt install python3-pip'

Tokenization

# Import NLTK and its ready-made sentence and word tokenizers.
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# The tokenizers need the pre-trained Punkt models. Older NLTK releases
# look for the 'punkt' resource and recent ones for 'punkt_tab', so both
# are requested here.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

# Adjacent string literals inside parentheses are joined by the parser, so
# no '+' or backslash continuation is needed. Every fragment that ends a
# word carries its own trailing space.
text = (
    "Natural language processing (NLP) is a field "
    "of computer science, artificial intelligence "
    "and computational linguistics concerned with "
    "the interactions between computers and human "
    "(natural) languages, and, in particular, "
    "concerned with programming computers to "
    "fruitfully process large natural language "
    "corpora. Challenges in natural language "
    "processing frequently involve natural "
    "language understanding, natural language "
    "generation (frequently from formal, machine"
    "-readable logical forms), connecting language "
    "and machine perception, managing human-"
    "computer dialog systems, or some combination "
    "thereof."
)

# Split the text into sentences, then into individual word tokens.
print(sent_tokenize(text))
print(word_tokenize(text))


Lemmatization


import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus before the lemmatizer is used.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Each entry pairs a word with the keyword arguments passed to lemmatize().
# pos="a" tells the lemmatizer to treat the word as an adjective.
samples = [
    ("rocks", {}),
    ("corpora", {}),
    ("better", {"pos": "a"}),
]

for word, options in samples:
    print(word + " :", lemmatizer.lemmatize(word, **options))


Stemming

# PorterStemmer does the stemming; word_tokenize is imported as in the
# original snippet but is not used below.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

porter = PorterStemmer()

# Inflected and derived forms of "program" to run through the stemmer.
words = ["program", "programs", "programmer", "programming", "programmers"]

# Pair every word with its stem and print them side by side.
for word, stem in zip(words, map(porter.stem, words)):
    print(word, " : ", stem)


Arabic stemming

Farasa is an Arabic NLP toolkit serving the following tasks:

  1. Segmentation.
  2. Stemming.
  3. Part Of Speech tagging (POS tagging).
  4. Diacritization.
To install it use 'pip install farasapy'

Stemming

from farasa.stemmer import FarasaStemmer

# Create the stemmer once; it can then be reused for several texts.
stemmer = FarasaStemmer()

# Example Arabic sentence to stem.
text = 'أعلنت جامعة سعيدة تنظيمها مسابقة في الدكتوراه'

# Stem the text and print the result. The two statements must be on
# separate lines; on a single line they are a syntax error.
stemmed_text = stemmer.stem(text)
print(stemmed_text)


POS tagging


from farasa.pos import FarasaPOSTagger

# Sentence to annotate with part-of-speech tags.
text = 'أعلنت جامعة سعيدة تنظيمها مسابقة في الدكتوراه'

# Build the tagger, tag the sentence and print the result directly.
tagger = FarasaPOSTagger()
print(tagger.tag(text))

Diacritization
# NOTE(review): "diacratizer" (with an "a") appears to be the module name
# that farasapy actually ships; confirm before "correcting" the spelling.
from farasa.diacratizer import FarasaDiacritizer

# Sentence whose diacritics (short-vowel marks) should be restored.
text = 'أعلنت جامعة سعيدة تنظيمها مسابقة في الدكتوراه'

# Build the diacritizer, process the sentence and print the result directly.
diacritizer = FarasaDiacritizer()
print(diacritizer.diacritize(text))




Modifié le: lundi 13 mai 2024, 07:34