Rest of lab sheet 2
To perform the 2nd task you need to
- install NLTK (Natural Language Tool Kit)
- use the command "pip install nltk"
- if the program 'pip' is not installed, install it first using 'sudo apt install python3-pip'
Tokenization
# Import NLTK's ready-made sentence and word tokenizers.
# NOTE: both tokenizers rely on NLTK's Punkt data; if a LookupError is raised,
# download it once with nltk.download('punkt') (newer NLTK releases may ask
# for 'punkt_tab' instead).
from nltk.tokenize import sent_tokenize, word_tokenize

# Adjacent string literals inside parentheses are concatenated by Python.
# Every fragment that ends in the middle of a sentence keeps its trailing
# space so that words are not glued together across fragment boundaries.
text = (
    "Natural language processing (NLP) is a field "
    "of computer science, artificial intelligence "
    "and computational linguistics concerned with "
    "the interactions between computers and human "
    "(natural) languages, and, in particular, "
    "concerned with programming computers to "
    "fruitfully process large natural language "
    "corpora. Challenges in natural language "
    "processing frequently involve natural "
    "language understanding, natural language "
    "generation (frequently from formal, machine"
    "-readable logical forms), connecting language "
    "and machine perception, managing human-"
    "computer dialog systems, or some combination "
    "thereof."
)

# Split the paragraph into a list of sentences.
print(sent_tokenize(text))
# Split the paragraph into a list of word and punctuation tokens.
print(word_tokenize(text))
Lemmatization
import nltk

# Fetch the WordNet corpus that the lemmatizer looks words up in.
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Each entry: (label to print, word to lemmatize, extra keyword arguments).
# pos="a" asks the lemmatizer to treat the word as an adjective.
examples = (
    ("rocks :", "rocks", {}),
    ("corpora :", "corpora", {}),
    ("better :", "better", {"pos": "a"}),
)
for label, word, options in examples:
    print(label, lemmatizer.lemmatize(word, **options))
Stemming
# Import the Porter stemmer (word_tokenize is imported as well, although this
# snippet stems a fixed word list and does not call it).
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

# Choose some words to be stemmed.
words = ["program", "programs", "programmer", "programming", "programmers"]
for w in words:
    # The loop body must be indented; without it Python raises IndentationError.
    print(w, " : ", ps.stem(w))
Arabic stemming
Farasa is an Arabic NLP toolkit serving the following tasks:
- Segmentation.
- Stemming.
- Part Of Speech tagging (POS tagging).
- Diacritization.
Stemming
# FarasaStemmer is provided by the third-party 'farasapy' package
# (install with: pip install farasapy).
from farasa.stemmer import FarasaStemmer

stemmer = FarasaStemmer()

# Example text
text = 'أعلنت جامعة سعيدة تنظيمها مسابقة في الدكتوراه'

# Stem the text, then print the result. These are two separate statements
# and each needs its own line.
stemmed_text = stemmer.stem(text)
print(stemmed_text)
POS tagging
# The tagger class has to be imported before it can be used; it is provided
# by the 'farasapy' package in the farasa.pos module.
from farasa.pos import FarasaPOSTagger

# Initialize FarasaPOS
farasa_pos = FarasaPOSTagger()

# Example text
text = 'أعلنت جامعة سعيدة تنظيمها مسابقة في الدكتوراه'

# Perform POS tagging
pos_tags = farasa_pos.tag(text)

# Print POS tags
print(pos_tags)
Diacritization
# NOTE: the module name really is spelled "diacratizer" in this import.
from farasa.diacratizer import FarasaDiacritizer

# Create the diacritizer object.
farasa_diacritizer = FarasaDiacritizer()

# Sentence to be diacritized.
text = 'أعلنت جامعة سعيدة تنظيمها مسابقة في الدكتوراه'

# Run the diacritizer on the sentence and keep the result.
diacritized_text = farasa_diacritizer.diacritize(text)

# Show the diacritized sentence.
print(diacritized_text)