import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import brown

#################################
##
## Exercise set 2
##
#################################

print("\nExercise 1")
print('******************')

# Tokenize the raw text and report token / type counts.
with open("crisis.txt", 'r') as f:
    raw = f.read()
crisis_tokens = nltk.word_tokenize(raw)
print("There are {} tokens and {} different types in the text.".format(
    len(crisis_tokens), len(set(crisis_tokens)))
)

print("\nExercise 1b")
print('******************')

# Classify every token as a word, a number, a punctuation sign, or other.
words = []
numbers = []
puncts = []
others = []
for t in crisis_tokens:
    if t.isalpha():
        words.append(t)
    elif t.isnumeric():
        numbers.append(t)
    elif t in ".,;:?!'-" or t in '"':
        puncts.append(t)
    else:
        others.append(t)

print("Number of words: ", len(words))
print("Number of numbers: ", len(numbers))
print("Number of punctuation signs: ", len(puncts))
print("Number of other tokens: ", len(others))

print("\nExercise 1c")
print('******************')

sents = nltk.sent_tokenize(raw)
print("There are {} many sentences.".format(len(sents)))

print("\nExercise 1d")
print('******************')

# Sentence-split each non-empty paragraph separately so that paragraph
# breaks always end a sentence.
paragraphs = raw.split("\n")
non_empty_paragraphs = [p for p in paragraphs if p != '']
sents2 = [s for p in non_empty_paragraphs for s in nltk.sent_tokenize(p)]
print("After splitting into paragraphs there are {} many sentences.".format(
    len(sents2)))
print("One of the sentences is still split into two.")

print("\nExercise 1e and f")
print('******************')

tokenized = [nltk.word_tokenize(s) for s in sents2]
# BUG FIX: the original condition read `t in '"'` — `t` was a stale
# variable left over from the Exercise 1b loop — so double-quote tokens
# were never filtered out.  The test must use the comprehension's own
# variable `w`.
cleaned = [[w for w in s if not (w in ".,;:?!'-" or w in '"')]
           for s in tokenized]
number_of_words = sum(len(s) for s in cleaned)
ave_length = number_of_words / len(cleaned)
print("The average sentence length is {}".format(round(ave_length, 3)))

print("\nExercise 2a")
print('******************')

# Each x is a (word, tag) pair from the universal tagset.
uni_tag_words = [x for x in brown.tagged_words(tagset='universal')]
# Count the number of occurrences of each tag.
uni_tag_freq = nltk.FreqDist([t for w, t in uni_tag_words])
for t in uni_tag_freq:
    print("{:7}{:10}".format(t, uni_tag_freq[t]))

print("\nExercise 2b")
print('******************')

# For each word this gives a FreqDist over the tags assigned to the word.
uni_distr = nltk.ConditionalFreqDist(uni_tag_words)
# Map each word to its number of different tags.
number_of_tags = {w: len(uni_distr[w]) for w in uni_distr}
# The frequency of frequencies of tags assigned to a word.
freq_freqs = nltk.FreqDist([number_of_tags[w] for w in number_of_tags])
for numb in sorted(freq_freqs):
    print("{:10} words have {} different tags".format(freq_freqs[numb], numb))
m = max(freq_freqs)
print("\nThe following words occur with {} different tags:".format(m))
for w in number_of_tags:
    if number_of_tags[w] == m:
        print(w)

print("\nExercise 3a")
print('******************')

full_tag_words = [x for x in brown.tagged_words()]
full_tags = nltk.FreqDist([t for w, t in full_tag_words])
print("There are {} many different tags.".format(len(full_tags)))

print("\nExercise 3b")
print('******************')


def transform(tagged_words):
    """Normalize a tagged sequence of words.

    For every (word, tag) pair: strip a leading foreign-word marker
    'FW-', keep only the part before the first '-', keep a trailing
    negation '*' as its own tag, and split combined 'A+B' tags into one
    (word, tag) pair per component tag.
    """
    new = []
    for pair in tagged_words:
        word, tag = pair
        if tag.startswith('FW-'):
            tag = tag[3:]
        tag = tag.split('-')[0]
        # Detach a trailing '*' so it survives the '+'-split as its own tag.
        if tag.endswith('*') and len(tag) > 1:
            tag = tag[:-1] + '+*'
        tags = tag.split('+')
        for t in tags:
            new.append((word, t))
    new.reverse()
    return new


transformed = transform(full_tag_words)

print("\nExercise 3c")
print('******************')

full_tag_freq = nltk.FreqDist([t for w, t in transformed])
for t in sorted(full_tag_freq):
    print("{:7}{:10}".format(t, full_tag_freq[t]))
print("\nThere were originally {} many tag tokens".format(len(full_tag_words)))
print("There are {} many tag tokens after transformation".format(len(transformed)))
print("There are now {} different tags".format(len(full_tag_freq)))

print("\nExercise 3d")
print('******************')

# This goes like (2b), but on the transformed (word, tag) pairs.
full_distr = nltk.ConditionalFreqDist(transformed)
number_of_tags = {w: len(full_distr[w]) for w in full_distr}
tag_freqs = nltk.FreqDist([number_of_tags[w] for w in number_of_tags])
for numb in sorted(tag_freqs):
    print("{:10} words have {} different tags".format(tag_freqs[numb], numb))
# BUG FIX: the original took max(freq_freqs) — the Exercise 2b
# distribution — instead of max(tag_freqs) computed here, so the report
# below used the wrong maximum tag count.
m = max(tag_freqs)
print("\nThe following words occur with {} different tags:".format(m))
for w in number_of_tags:
    if number_of_tags[w] == m:
        print(w)
print("\nThe following words occur with {} different tags:".format(m - 1))
for w in number_of_tags:
    if number_of_tags[w] == m - 1:
        print(w)


# Exercise 4
# Convert a FreqDist (or any mapping) to a pair of numpy arrays
# (keys, values), in several variants.

# Alt. 1: arrays in the dict's iteration order.
def to_arrays(fd):
    """Return (keys, values) as numpy arrays in iteration order."""
    return np.array(list(fd.keys())), np.array(list(fd.values()))


# Alt. 2: same result via a single pass over the items.
def fd_to_np_arrays(fd):
    """Return (keys, values) as numpy arrays in iteration order."""
    keys, values = zip(*list(fd.items()))
    return np.array(keys), np.array(values)


def to_arrays_keys(fd):
    """Return (keys, values) as numpy arrays, sorted by key."""
    sorted_keys = sorted(list(fd.keys()))
    sorted_values = [fd[k] for k in sorted_keys]
    return np.array(sorted_keys), np.array(sorted_values)


def to_arrays_values(fd):
    """Return (keys, values) as numpy arrays, sorted ascending by value
    (ties broken by key)."""
    ordered = sorted([b, a] for a, b in fd.items())
    values, keys = zip(*ordered)
    return np.array(keys), np.array(values)