rm(list=ls(all=TRUE)) # clear memory
source("http://www.linguistics.ucsb.edu/faculty/stgries/exact.matches.2.r")
entropy <- function (distribution, normalized=FALSE) {
   entr <- -sum((temp <- distribution[distribution > 0]/sum(distribution)) * log2(temp))
   if (normalized) { entr <- entr/log2(length(distribution)) }
   return(entr)
}
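## quick sanity check of the entropy function: a uniform distribution over 4
## outcomes has an entropy of 2 bits (and a normalized entropy of 1), while a
## maximally skewed distribution has an entropy of 0 bits
entropy(c(5, 5, 5, 5))                  # 2
entropy(c(5, 5, 5, 5), normalized=TRUE) # 1
entropy(c(20, 0, 0, 0))                 # 0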



# loading the whole corpus ########################################################################
## define the locations of the corpus files
corpus.files <-                  # make corpus.files
      dir(                       # the content of the directory/folder
         "03_data/Brown_tagged", # with this name
         full.names=TRUE)        # and retain the complete paths to the files

## create a collector/container for all sentences/lines from all files
all.corpus.sentences <- c()

## load each corpus file, strip everything up to the first space on each line, and store the sentences in the collector/container for later
for (counter in 1:15) {         # for each of the 15 corpus files
   current.corpus.file <-       # make current.corpus.file the result of
      scan(                     # loading
         corpus.files[counter], # the corpus file number 'counter'
         what=character(),      # which contains character strings
         sep="\n",              # separated by line breaks
         quiet=TRUE)            # suppress output

   current.corpus.file <-     # make current.corpus.file the result of
      gsub(                   # replacing
         "^.*? ",             # everything from the beginning of a line till the 1st space
         "",                  # by nothing
         current.corpus.file, # in current.corpus.file
         perl=TRUE)           # using Perl-compatible regular expressions

   all.corpus.sentences <-     # make all.corpus.sentences
      c(                       # the combination of
         all.corpus.sentences, # all.corpus.sentences and
         current.corpus.file)  # current.corpus.file
}
all.corpus.sentences <- tolower(all.corpus.sentences) # change all corpus sentences to lower case
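
## to see what the gsub step above does, consider a made-up line (the exact format
## of the Brown_tagged files may differ; the line-initial identifier here is only
## assumed for illustration):
gsub("^.*? ", "", "A01_0010 the_at jury_nn said_vbd", perl=TRUE)
## [1] "the_at jury_nn said_vbd"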



# finding verbs after "he" and after "she" ########################################################
verbs.after.he <-                   # make verbs.after.he the result of
   exact.matches.2(                 # the result of finding
      "(?<=\\bhe_pps )[^_]+(?=_v)", # what's right of he_pps and left of _v
      all.corpus.sentences)         # in all.corpus.sentences
verbs.after.she <-                   # make verbs.after.she the result of
   exact.matches.2(                  # the result of finding
      "(?<=\\bshe_pps )[^_]+(?=_v)", # what's right of she_pps and left of _v
      all.corpus.sentences)          # in all.corpus.sentences
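
## should exact.matches.2 not be available, the matches themselves (though not the
## concordance components used below) could also be retrieved with base R, e.g.:
## unlist(regmatches(all.corpus.sentences,
##    gregexpr("(?<=\\bhe_pps )[^_]+(?=_v)", all.corpus.sentences, perl=TRUE)))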

verbs.after.he.she.concordance <- # make verbs.after.he.she.concordance the result of
   c(                             # combining
      verbs.after.he[[4]],        # verbs.after.he, the 4th component (= the concordance)
      verbs.after.she[[4]])       # verbs.after.she, the 4th component (= the concordance)

verbs.after.he.she.concordance <-        # make verbs.after.he.she.concordance the result of
   paste(                                # pasting together
      verbs.after.he.she.concordance,    # the concordance lines with
      grepl(                             # checking whether
         "\\bshe_pps\t",                 # \\bshe_pps\t is in
         verbs.after.he.she.concordance, # verbs.after.he.she.concordance
         perl=TRUE),                     # using Perl-compatible regular expressions
      sep="\t")                          # with tabs as separators

cat(                                    # print
   "PRECEDING\tMATCH\tSUBSEQUENT\tSHE", # a file header, followed by
   verbs.after.he.she.concordance,      # the concordance,
   sep="\n",                            # into separate lines
   file="09_concordance-surprisal-entropy-practice.csv") # into this file

verbs.after.he <- verbs.after.he[[1]]   # reduce verbs.after.he to its first component, the exact matches
verbs.after.she <- verbs.after.she[[1]] # reduce verbs.after.she to its first component, the exact matches

all.verbs <-           # make all.verbs the result of
   c(                  # combining
      verbs.after.he,  # verbs.after.he (now just the exact matches) and
      verbs.after.she) # verbs.after.she (now just the exact matches)
all.pronouns <-                                           # make all.pronouns the result of
   rep(                                                   # repeating
      c("he", "she"),                                     # "he" and "she"
      c(length(verbs.after.he), length(verbs.after.she))) # as many times as there were hes and shes
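## (rep with a frequency vector as its 2nd argument repeats each element the
## corresponding number of times, e.g. rep(c("he", "she"), c(2, 3)) returns
## "he" "he" "she" "she" "she", so all.pronouns is exactly as long as all.verbs)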

verb.by.gender <-   # make verb.by.gender the result of
   table(           # tabulating
      all.verbs,    # all.verbs in the rows
      all.pronouns) # all.pronouns in the columns
verb.by.gender <-              # make verb.by.gender the result of
   verb.by.gender[             # verb.by.gender with
   order(                      # ordering the rows
      rowSums(verb.by.gender), # by their sums
      decreasing=TRUE),]       # in descending order
colSums(verb.by.gender) # how many he's and she's: he:4968 she:1486
##   he  she
## 4968 1486
head(verb.by.gender, 20) # look at the first 20 rows of verb.by.gender
##          all.pronouns
## all.verbs  he she
##   said    422 160
##   thought 106  28
##   knew    107  26
##   saw      94  28
##   went     89  24
##   felt     85  25
##   asked    70  30
##   looked   67  30
##   wanted   54  25
##   told     57  20
##   made     63  13
##   found    50  23
##   came     45  24
##   took     50  18
##   turned   48  20
##   got      46  12
##   says     50   7
##   stood    38  17
##   began    40  11
##   wrote    39  12
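
## each column of verb.by.gender can be read as the basis for conditional relative
## frequencies, which is what the surprisal computation below builds on; e.g., given
## the counts above, the relative frequency of "said" among the verbs after "he" is
## 422/4968, i.e. roughly 0.085:
verb.by.gender["said", "he"] / sum(verb.by.gender[,"he"]) # ~0.085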


# entropies #######################################################################################
## compute the 'raw' entropies
entropy(table(verbs.after.he))  # 8.493393
## [1] 8.493393
entropy(table(verbs.after.she)) # 7.637634
## [1] 7.637634
## compute the 'normalized' entropies, which range from 0 to 1
entropy(table(verbs.after.he), normalized=TRUE)  # 0.8310253
## [1] 0.8310253
entropy(table(verbs.after.she), normalized=TRUE) # 0.8566328
## [1] 0.8566328


# surprisals ######################################################################################
surprisals.he <-                   # make surprisals.he the result of
   -log2(                          # taking the negative log to the base of 2 of
      verb.by.gender[,"he"] /      # the frequencies of the verbs after "he"
      sum(verb.by.gender[,"he"]))  # divided by the sum of those frequencies (4968)
surprisals.she <-                  # make surprisals.she the result of
   -log2(                          # taking the negative log to the base of 2 of
      verb.by.gender[,"she"] /     # the frequencies of the verbs after "she"
      sum(verb.by.gender[,"she"])) # divided by the sum of those frequencies (1486)
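
## sanity check: the raw entropy of the verbs after "he" computed above is just the
## frequency-weighted mean of these surprisals (excluding verbs that never occur
## after "he", whose surprisal is Inf); rel.freqs.he is only a temporary helper:
rel.freqs.he <- verb.by.gender[,"he"] / sum(verb.by.gender[,"he"])
sum((rel.freqs.he * surprisals.he)[rel.freqs.he > 0]) # 8.493393, as above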

## look at the 30 most surprising verbs after "he":
head(                                             # the head of
   sort(                                          # sorting
      surprisals.he[!is.infinite(surprisals.he)], # the values of surprisals.he that are not Inf
      decreasing=TRUE),                           # in descending order
   30)                                            # namely the top 30
##      clung  described  developed    married    settled    snarled
##   12.27845   12.27845   12.27845   12.27845   12.27845   12.27845
##    sounded       tore     begged    changed    dressed    frowned
##   12.27845   12.27845   12.27845   12.27845   12.27845   12.27845
##      named  reflected   smoothed  submitted  addressed    admired
##   12.27845   12.27845   12.27845   12.27845   12.27845   12.27845
## apologized   arranged       beat        bit       blew   breathed
##   12.27845   12.27845   12.27845   12.27845   12.27845   12.27845
##     builds     chewed   compared  confessed  consulted  continues
##   12.27845   12.27845   12.27845   12.27845   12.27845   12.27845
## look at the 30 most surprising verbs after "she":
head(                                               # the head of
   sort(                                            # sorting
      surprisals.she[!is.infinite(surprisals.she)], # the values of surprisals.she that are not Inf
      decreasing=TRUE),                             # in descending order
   30)                                              # namely the top 30
##  explained     pulled       sees considered    watched approached
##   10.53722   10.53722   10.53722   10.53722   10.53722   10.53722
##      drove   finished      makes      hopes   received    refused
##   10.53722   10.53722   10.53722   10.53722   10.53722   10.53722
##      broke    managed      tells  whispered    arrived     claims
##   10.53722   10.53722   10.53722   10.53722   10.53722   10.53722
##     failed     leaned     placed   recalled     sensed      urged
##   10.53722   10.53722   10.53722   10.53722   10.53722   10.53722
##   followed    grabbed       grew     killed   listened     rolled
##   10.53722   10.53722   10.53722   10.53722   10.53722   10.53722
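
## note: these maximal surprisal values are all identical because each of the verbs
## listed occurs just once after the respective pronoun, which yields the highest
## possible (finite) surprisal:
-log2(1/4968) # 12.27845, the surprisal of a frequency-1 verb after "he"
-log2(1/1486) # 10.53722, the surprisal of a frequency-1 verb after "she"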