import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import brown
#################################
##
## Exercise set 2
##
#################################
print("\nExercise 1")
print('******************')
# Read the whole text once; all tokenization below works on this string.
with open("crisis.txt", 'r') as f:
    raw = f.read()
crisis_tokens = nltk.word_tokenize(raw)
print("There are {} tokens and {} different types in the text.".format(
    len(crisis_tokens), len(set(crisis_tokens)))
)
print("\nExercise 1b")
print('******************')
# Partition the tokens into words / numbers / punctuation / everything else.
words = []
numbers = []
puncts = []
others = []
for t in crisis_tokens:
    if t.isalpha():
        words.append(t)
    elif t.isnumeric():
        numbers.append(t)
    elif t in ".,;:?!'-" or t in '"':
        puncts.append(t)
    else:
        others.append(t)
print("Number of words: ", len(words))
print("Number of numbers: ", len(numbers))
print("Number of punctuation signs: ", len(puncts))
print("Number of other tokens: ", len(others))
print("\nExercise 1c")
print('******************')
sents = nltk.sent_tokenize(raw)
print("There are {} many sentences.".format(len(sents)))
print("\nExercise 1d")
print('******************')
# Sentence-tokenize each non-empty paragraph separately, so that
# paragraph breaks always end a sentence.
paragraphs = raw.split("\n")
non_empty_paragraphs = [p for p in paragraphs if p != '']
sents2 = [s for p in non_empty_paragraphs
          for s in nltk.sent_tokenize(p)]
print("After splitting into paragraphs there are {} many sentences.".format(
    len(sents2)))
print("One of the sentences is still split into two.")
print("\nExercise 1e and f")
print('******************')
tokenized = [nltk.word_tokenize(s) for s in sents2]
# Drop punctuation tokens before computing the average sentence length.
# BUG FIX: the second membership test used the stale loop variable `t`
# (left over from the Exercise 1b loop) instead of `w`, so double-quote
# tokens were kept or dropped based on an unrelated token.
cleaned = [[w for w in s if not (w in ".,;:?!'-" or w in '"')]
           for s in tokenized]
number_of_words = sum(len(s) for s in cleaned)
ave_length = number_of_words/len(cleaned)
print("The average sentence length is {}".format(round(ave_length, 3)))
print("\nExercise 2a")
print('******************')
# Brown corpus (word, tag) pairs using the simplified universal tagset.
uni_tag_words = list(brown.tagged_words(tagset='universal'))
# Count the number of occurrences of each universal tag.
uni_tag_freq = nltk.FreqDist(tag for _, tag in uni_tag_words)
for t in uni_tag_freq:
    print("{:7}{:10}".format(t, uni_tag_freq[t]))
print("\nExercise 2b")
print('******************')
# For each word: a FreqDist over the tags assigned to that word.
uni_distr = nltk.ConditionalFreqDist(uni_tag_words)
# Map each word to its number of distinct tags.
number_of_tags = {word: len(uni_distr[word]) for word in uni_distr}
# Distribution of "how many words have k distinct tags".
freq_freqs = nltk.FreqDist(number_of_tags[word] for word in number_of_tags)
for numb in sorted(freq_freqs):
    print("{:10} words have {} different tags".format(freq_freqs[numb], numb))
m = max(freq_freqs)
print("\nThe following words occur with {} different tags:".format(m))
for w in number_of_tags:
    if number_of_tags[w] == m:
        print(w)
print("\nExercise 3a")
print('******************')
# Same corpus with the full (original) Brown tagset.
full_tag_words = list(brown.tagged_words())
full_tags = nltk.FreqDist(tag for _, tag in full_tag_words)
print("There are {} many different tags.".format(len(full_tags)))
def transform(tagged_words):
    """Normalize Brown-style tags on a sequence of (word, tag) pairs.

    For each pair: strip a leading 'FW-' (foreign word) marker, keep only
    the part of the tag before the first hyphen, detach a trailing '*'
    (negation) into its own tag, and split compound 'A+B' tags into one
    (word, tag) pair per component.  The resulting list is returned in
    reversed order, as in the original implementation.
    """
    result = []
    for word, tag in tagged_words:
        if tag.startswith('FW-'):
            tag = tag[3:]
        # Keep only what precedes the first hyphen (drops '-TL', '-HL', ...).
        tag = tag.split('-')[0]
        # Turn a trailing negation star into a '+'-separated component
        # so the split below emits it as a separate tag.
        if len(tag) > 1 and tag.endswith('*'):
            tag = tag[:-1] + '+*'
        # Compound tags like 'A+B' become separate (word, tag) pairs.
        result.extend((word, part) for part in tag.split('+'))
    result.reverse()
    return result
transformed = transform(full_tag_words)
print("\nExercise 3c")
print('******************')
full_tag_freq = nltk.FreqDist([t for w, t in transformed])
for t in sorted(full_tag_freq):
    print("{:7}{:10}".format(t, full_tag_freq[t]))
print("\nThere were originally {} many tag tokens".format(len(full_tag_words)))
print("There are {} many tag tokens after transformation".format(len(transformed)))
print("There are now {} different tags".format(len(full_tag_freq)))
print("\nExercise 3d")
print('******************')
# This goes like (2b), but on the transformed (word, tag) pairs.
full_distr = nltk.ConditionalFreqDist(transformed)
number_of_tags = {w: len(full_distr[w]) for w in full_distr}
tag_freqs = nltk.FreqDist([number_of_tags[w] for w in number_of_tags])
for numb in sorted(tag_freqs):
    print("{:10} words have {} different tags".format(tag_freqs[numb], numb))
# BUG FIX: this used max(freq_freqs) — Exercise 2b's distribution over the
# *universal* tagset — instead of tag_freqs computed just above for the
# transformed full tagset.
m = max(tag_freqs)
print("\nThe following words occur with {} different tags:".format(m))
for w in number_of_tags:
    if number_of_tags[w] == m:
        print(w)
print("\nThe following words occur with {} different tags:".format(m-1))
for w in number_of_tags:
    if number_of_tags[w] == m-1:
        print(w)
# Exercise 4
# Alt. 1
def to_arrays(fd):
    """Return (keys, values) numpy arrays for a frequency distribution,
    in the distribution's own iteration order."""
    keys = np.array(list(fd.keys()))
    values = np.array(list(fd.values()))
    return keys, values
# Alt. 2
def fd_to_np_arrays(fd):
    """Return (keys, values) numpy arrays for a frequency distribution,
    in the distribution's own iteration order.

    FIX: an empty distribution used to raise ValueError (unpacking an
    empty zip); it now returns two empty arrays.
    """
    if not fd:
        return np.array([]), np.array([])
    keys, values = zip(*fd.items())
    return np.array(keys), np.array(values)
def to_arrays_keys(fd):
    """Return (keys, values) numpy arrays for a frequency distribution,
    sorted by key."""
    ordered_keys = sorted(fd)
    ordered_values = [fd[key] for key in ordered_keys]
    return np.array(ordered_keys), np.array(ordered_values)
def to_arrays_values(fd):
    """Return (keys, values) numpy arrays for a frequency distribution,
    sorted by ascending count, ties broken by key."""
    pairs = sorted(fd.items(), key=lambda item: (item[1], item[0]))
    keys, values = zip(*pairs)
    return np.array(keys), np.array(values)