# NOTE(review): the next line deletes every object in the current environment, including
# hidden ones whose names start with a dot (all=TRUE); anything created before this script
# runs is lost.
rm(list=ls(all=TRUE)) # clear memory
# Run an R script from a remote location; presumably it defines exact.matches.2(), which is
# called further down but not defined in this file -- TODO confirm.
# NOTE(review): this needs network access and fetches code over plain http.
source("http://www.linguistics.ucsb.edu/faculty/stgries/exact.matches.2.r")
# Shannon entropy (in bits) of a frequency distribution.
#
# Arguments:
#   distribution: a numeric vector or table of non-negative frequencies; zero frequencies
#                 are left out of the sum (i.e. 0 * log2(0) is treated as 0).
#   normalized:   if TRUE, the entropy is divided by log2(length(distribution)), the
#                 maximal entropy for that many categories, so the result is between 0 and 1.
# Value:
#   a single number: the entropy or, if requested, the normalized entropy.
entropy <- function (distribution, normalized=FALSE) {
   # relative frequencies of the attested (non-zero) categories
   probs <- distribution[distribution > 0] / sum(distribution)
   entr <- -sum(probs * log2(probs))
   if (normalized) {
      # with fewer than two categories the denominator would be log2(1) = 0, and 0/0 is NaN;
      # such a distribution carries no uncertainty, so its normalized entropy is 0
      if (length(distribution) < 2) { return(0) }
      entr <- entr / log2(length(distribution))
   }
   return(entr)
}
# loading the whole corpus ########################################################################
## define the locations of the corpus files
corpus.files <- dir(         # the content of the directory/folder
   "03_data/Brown_tagged",   # with this name,
   full.names=TRUE)          # retaining the complete paths to the files
if (length(corpus.files) == 0) { # fail early with a clear message rather than inside scan()
   stop("no corpus files found in 03_data/Brown_tagged", call.=FALSE)
}
## create a collector/container with one slot per corpus file (preallocated, so that the
## result does not have to be copied and regrown on every iteration)
corpus.lines.by.file <- vector("list", length(corpus.files))
## load each corpus file, strip the beginning of each line, and store the lines for later
for (counter in seq_along(corpus.files)) { # for each corpus file that was found, however many
   current.corpus.file <- scan( # load
      corpus.files[counter],    # the corpus file number 'counter',
      what=character(),         # which contains character strings
      sep="\n",                 # separated by line breaks;
      quiet=TRUE)               # suppress output
   current.corpus.file <- gsub( # replace
      "^.*? ",                  # everything from the beginning of a line till the 1st space
      "",                       # by nothing
      current.corpus.file,      # in current.corpus.file
      perl=TRUE)                # using Perl-compatible regular expressions
   corpus.lines.by.file[[counter]] <- current.corpus.file # store the lines in this file's slot
}
## combine the lines of all files, in file order, into one vector and change it to lower case
all.corpus.sentences <- tolower(unlist(corpus.lines.by.file, use.names=FALSE))
# finding verbs after "he" and after "she" ########################################################
## search all.corpus.sentences for what stands between he_pps (or she_pps) plus a space on
## the left and a tag beginning with _v on the right
verbs.after.he <- exact.matches.2("(?<=\\bhe_pps )[^_]+(?=_v)", all.corpus.sentences)
verbs.after.she <- exact.matches.2("(?<=\\bshe_pps )[^_]+(?=_v)", all.corpus.sentences)
## build the concordance: pool the 4th components (= the concordance lines) of both search
## results and append, after a tab, TRUE/FALSE for whether \\bshe_pps\t is found in the line
verbs.after.he.she.concordance <- local({
   concordance.lines <- c(verbs.after.he[[4]], verbs.after.she[[4]])
   she.found <- grepl("\\bshe_pps\t", concordance.lines, perl=TRUE)
   paste(concordance.lines, she.found, sep="\t")
})
## print a file header followed by the concordance, on separate lines, into the output file
cat("PRECEDING\tMATCH\tSUBSEQUENT\tSHE",
   verbs.after.he.she.concordance,
   file="09_concordance-surprisal-entropy-practice.csv",
   sep="\n")
## reduce both search results to their 1st components, the exact matches
verbs.after.he <- verbs.after.he[[1]]
verbs.after.she <- verbs.after.she[[1]]
## pool the verbs found after both pronouns, the ones after "he" first
all.verbs <- c(verbs.after.he, verbs.after.she)
## a parallel vector stating for each pooled verb which pronoun it followed
all.pronouns <- rep(c("he", "she"), times=c(length(verbs.after.he), length(verbs.after.she)))
## cross-tabulate: verbs in the rows, pronouns in the columns
verb.by.gender <- table(all.verbs, all.pronouns)
## re-order the rows by their sums so that the overall most frequent verbs come first
verb.by.gender <- verb.by.gender[order(rowSums(verb.by.gender), decreasing=TRUE), ]
## how many verbs were found after each pronoun: he:4968 she:1486
colSums(verb.by.gender)
## he she
## 4968 1486
## look at the 20 most frequent verbs
head(verb.by.gender, n=20)
## all.pronouns
## all.verbs he she
## said 422 160
## thought 106 28
## knew 107 26
## saw 94 28
## went 89 24
## felt 85 25
## asked 70 30
## looked 67 30
## wanted 54 25
## told 57 20
## made 63 13
## found 50 23
## came 45 24
## took 50 18
## turned 48 20
## got 46 12
## says 50 7
## stood 38 17
## began 40 11
## wrote 39 12
# entropies #######################################################################################
## 'raw' entropies of the frequency distributions of the verbs after each pronoun
entropy(distribution=table(verbs.after.he), normalized=FALSE) # 8.493393
## [1] 8.493393
entropy(distribution=table(verbs.after.she), normalized=FALSE) # 7.637634
## [1] 7.637634
## 'normalized' entropies of the same distributions; these range from 0 to 1
entropy(distribution=table(verbs.after.he), normalized=TRUE) # 0.8310253
## [1] 0.8310253
entropy(distribution=table(verbs.after.she), normalized=TRUE) # 0.8566328
## [1] 0.8566328
# surprisals ######################################################################################
## the surprisal of a verb after a pronoun is the negative log to the base of 2 of the verb's
## relative frequency after that pronoun; the denominators are computed from verb.by.gender
## rather than typed in, so they stay correct when the corpus or the search changes
surprisals.he <-                  # make surprisals.he the result of
   -log2(                         # taking the negative log to the base of 2 of
      verb.by.gender[,"he"] /     # the frequencies of the verbs after "he"
      sum(verb.by.gender[,"he"])) # divided by the sum of those frequencies
surprisals.she <-                  # make surprisals.she the result of
   -log2(                          # taking the negative log to the base of 2 of
      verb.by.gender[,"she"] /     # the frequencies of the verbs after "she"
      sum(verb.by.gender[,"she"])) # divided by the sum of those frequencies
## look at the top 30 surprising verbs after "he":
head(                                           # the head of
   sort(                                        # sorting
      surprisals.he[!is.infinite(surprisals.he)], # the values of surprisals.he that are not Inf
      decreasing=TRUE),                         # in descending order
   30)                                          # namely the top 30
## clung described developed married settled snarled
## 12.27845 12.27845 12.27845 12.27845 12.27845 12.27845
## sounded tore begged changed dressed frowned
## 12.27845 12.27845 12.27845 12.27845 12.27845 12.27845
## named reflected smoothed submitted addressed admired
## 12.27845 12.27845 12.27845 12.27845 12.27845 12.27845
## apologized arranged beat bit blew breathed
## 12.27845 12.27845 12.27845 12.27845 12.27845 12.27845
## builds chewed compared confessed consulted continues
## 12.27845 12.27845 12.27845 12.27845 12.27845 12.27845
## look at the top 30 surprising verbs after "she":
head(                                             # the head of
   sort(                                          # sorting
      surprisals.she[!is.infinite(surprisals.she)], # the values of surprisals.she that are not Inf
      decreasing=TRUE),                           # in descending order
   30)                                            # namely the top 30
## explained pulled sees considered watched approached
## 10.53722 10.53722 10.53722 10.53722 10.53722 10.53722
## drove finished makes hopes received refused
## 10.53722 10.53722 10.53722 10.53722 10.53722 10.53722
## broke managed tells whispered arrived claims
## 10.53722 10.53722 10.53722 10.53722 10.53722 10.53722
## failed leaned placed recalled sensed urged
## 10.53722 10.53722 10.53722 10.53722 10.53722 10.53722
## followed grabbed grew killed listened rolled
## 10.53722 10.53722 10.53722 10.53722 10.53722 10.53722