# Start from an empty workspace: this removes every object, including hidden
# ones (names starting with "."), from the environment the script is run in.
rm(list=ls(all=TRUE)) # clear memory
# Load helper code from a remote file; exact.matches.2(), which is called
# further below, is presumably defined there -- TODO confirm.
# NOTE(review): this runs code fetched over plain HTTP and requires network access.
source("http://www.linguistics.ucsb.edu/faculty/stgries/exact.matches.2.r")

# collecting all verbs and all ditransitives for association strengths ############################
## define the locations of the corpus files
corpus.files <-                   # make corpus.files
      dir(                        # the content of the directory/folder
         "03_data/ICEGB_sampled", # with this name
         full.names=TRUE)         # and retain the complete paths to the files
if (length(corpus.files) == 0) {  # if no file was found, say so right away
   warning("no corpus files found in 03_data/ICEGB_sampled", call.=FALSE)
}

## create one slot per corpus file for its verbs and for its ditransitive flags
## (filling pre-allocated lists avoids copying an ever-growing vector on every iteration)
verbs.per.file <- ditrans.per.file <- # make both collectors
   vector(                            # an empty
      "list",                         # list
      length(corpus.files))           # with as many slots as there are corpus files

for (counter in seq_along(corpus.files)) { # for each corpus file that was found (no fixed count)
   current.corpus.file <-       # make current.corpus.file the result of
      scan(                     # loading
         corpus.files[counter], # the corpus file number 'counter'
         what=character(),      # which contains character strings
         sep="\n",              # separated by line breaks
         quiet=TRUE)            # suppress output

   curr.verbs <-              # make curr.verbs the result of
      grep(                   # finding
         "\\bV\\(",           # V before (
         current.corpus.file, # in current.corpus.file
         perl=TRUE,           # using Perl-compatible regular expressions
         value=TRUE)          # return what you find (i.e. the verbs with their annotation)
   curr.ditrans <-   # make curr.ditrans the result of
      grepl(         # deciding whether
         "V\\(ditr", # V before (ditr occurs
         curr.verbs, # in curr.verbs (!), so both results have one element per verb
         perl=TRUE)  # using Perl-compatible regular expressions

   verbs.per.file[[counter]] <-   # store in the slot of this file
      tolower(curr.verbs)         # curr.verbs in lower case
   ditrans.per.file[[counter]] <- # store in the slot of this file
      curr.ditrans                # curr.ditrans
}

## flatten the per-file results into one vector each, keeping the corpus-file order
all.verbs <-          # make all.verbs the result of
   unlist(            # unlisting
      verbs.per.file, # the verbs of all files
      use.names=FALSE) # without adding names
all.ditrans <-          # make all.ditrans the result of
   unlist(              # unlisting
      ditrans.per.file, # the ditransitive flags of all files
      use.names=FALSE)  # without adding names



# cross-tabulate verbs and ditransitives ##########################################################
## reduce every annotated verb line to its word form: keep what follows a "{" up to
## (but excluding) the next "}", use only the first component of what exact.matches.2
## returns (concordance output is switched off), and then delete every "<l>"
all.verbs <- gsub(
   "<l>", "",
   exact.matches.2("(?<={)[^}]+", all.verbs, gen.conc.output=FALSE)[[1]],
   perl=TRUE)

## count verb forms (rows) against ditransitive use (columns FALSE/TRUE) ...
verbs.by.ditr <- table(all.verbs, all.ditrans)
## ... and move the forms that are most frequent in the ditransitive to the top
ditr.order <- order(verbs.by.ditr[,"TRUE"], decreasing=TRUE)
verbs.by.ditr <- verbs.by.ditr[ditr.order,]

## look at the constructional frequencies:
colSums(verbs.by.ditr) # non-ditransitives:139039 ditransitives:1841
##  FALSE   TRUE
## 139039   1841
## look at ...
head(verbs.by.ditr, 20) # ... the first 20 rows of verbs.by.ditr
##            all.ditrans
## all.verbs   FALSE TRUE
##   give        206  237
##   told        103  219
##   tell        147  187
##   given       171  142
##   gave         97   82
##   telling      37   59
##   gives        50   58
##   giving       82   50
##   show        180   44
##   ask         178   43
##   asked       184   40
##   send        112   37
##   sent        115   35
##   tells        15   34
##   offered      42   26
##   get        1442   23
##   offer        55   18
##   cost         22   17
##   convinced     3   12
##   showed       58   12
### the right column is the promised frequency list of 'things in a slot'



# compute one measure of association strength: residuals from chi-squared #########################
## run a chi-squared test (without continuity correction) on the verb-by-construction
## table and keep only its residuals, which have the same rows and columns as the table
chisq.result <- chisq.test(verbs.by.ditr, correct=FALSE)
assoc.strengths <- chisq.result[["residuals"]]

## sort the rows so that the largest residuals for ditransitive use come first
residual.order <- order(assoc.strengths[,"TRUE"], decreasing=TRUE)
assoc.strengths <- assoc.strengths[residual.order,]

## look at ...
head(assoc.strengths[,"TRUE"], 30) # ... the most strongly associated verb forms, the top 30
##      told      give      tell     given      gave   telling     gives
## 104.71002  96.09577  87.41971  68.18999  52.08543  51.55612  47.63382
##     tells    giving convinced   offered      send   assured      show
##  41.68901  36.75644  26.66126  26.63876  25.12051  24.85666  24.00646
##       ask      sent      cost     asked  convince    remind     offer
##  23.60348  23.59880  23.09912  21.66852  20.61375  17.56362  17.45259
##   awarded      lend    taught  accorded    warned persuaded  promised
##  17.28087  15.93787  15.92685  14.95360  14.55760  13.59993  13.50385
##     teach   informs
##  13.41237  12.89304
### note: this way of computing association strengths conflates frequency and effect size (like G-squared)!



# excursus: showing you that residuals from chi-squared are nearly the same as G-squared ##########
## one slot per verb form (i.e. per row of verbs.by.ditr), named by the verb form and
## filled with NA until a value has been computed
g.squareds <- setNames(rep(NA, nrow(verbs.by.ditr)), rownames(verbs.by.ditr))

## for every verb form that occurs in the ditransitive at least once, fit a binomial
## model predicting ditransitive use from "is this verb form" and store how much
## deviance that predictor removes relative to the null model (= G-squared);
## forms that never occur in the ditransitive keep their NA
for (verb in names(g.squareds)) {
   if (verbs.by.ditr[verb,"TRUE"] != 0) {
      temp <- glm(all.ditrans ~ all.verbs==verb, family=binomial)
      g.squareds[verb] <- temp[["null.deviance"]] - temp[["deviance"]]
   }
}

## look at the 30 largest G-squareds
head(sort(g.squareds, decreasing=TRUE), 30)
##       told       give       tell      given       gave    telling
## 1525.42533 1480.30586 1186.94462  815.91777  470.57796  386.66840
##      gives     giving      tells       show        ask       send
##  357.13223  262.05683  235.60342  165.43678  160.79999  157.58579
##       sent      asked    offered       cost  convinced      offer
##  144.28059  142.40063  136.54113   94.78462   89.24610   76.21397
##    assured   convince     remind     taught      teach    awarded
##   76.04456   60.00201   52.09516   50.77835   45.80067   44.58310
##     warned     showed       lend   promised   informed  persuaded
##   44.21980   41.55587   41.52804   41.34500   35.82393   33.15888
## put the G-squareds into the row order of assoc.strengths so that both line up
g.squareds <- g.squareds[rownames(assoc.strengths)]

## x-values: residuals for ditransitive use; y-values: G-squareds carrying the sign
## of the corresponding residual
ditr.residuals <- assoc.strengths[,"TRUE"]
signed.g.squareds <- g.squareds*sign(ditr.residuals)

## set up empty axes, add a grid, then print the verb forms (20% smaller than
## normal) at their coordinates
plot(ditr.residuals, signed.g.squareds, type="n",
   xlab="Residuals from chi-squared test", ylab="G-squared")
grid()
text(ditr.residuals, signed.g.squareds, labels=names(g.squareds), cex=0.8)

## Spearman rank correlation of residuals and signed G-squareds, expecting a positive
## correlation; the arguments are spelled out in full because they are echoed in the
## "data:" line of the printed result
cor.test(
   assoc.strengths[,"TRUE"],
   g.squareds*sign(assoc.strengths[,"TRUE"]),
   method="spearman",
   alternative="greater")
##
##  Spearman's rank correlation rho
##
## data:  assoc.strengths[, "TRUE"] and g.squareds * sign(assoc.strengths[, "TRUE"])
## S = 10855, p-value < 2.2e-16
## alternative hypothesis: true rho is greater than 0
## sample estimates:
##       rho
## 0.9874212
# excursus: separating frequency and effect size #################################################
## one slot per verb form (i.e. per row of verbs.by.ditr), named by the verb form and
## filled with NA until a value has been computed
log.odds.ratios <- setNames(rep(NA, nrow(verbs.by.ditr)), rownames(verbs.by.ditr))

## for every verb form that occurs in the ditransitive at least once, fit a binomial
## model predicting ditransitive use from "is this verb form" and keep the model's
## second coefficient (the one for the predictor) as the log odds ratio;
## forms that never occur in the ditransitive keep their NA
## NOTE(review): the very large values (around 15) in the output below look like
## artifacts of forms that only ever occur in the ditransitive -- confirm before
## interpreting them
for (verb in names(log.odds.ratios)) {
   if (verbs.by.ditr[verb,"TRUE"] != 0) {
      verb.model <- glm(all.ditrans ~ all.verbs==verb, family=binomial)
      log.odds.ratios[verb] <- coef(verb.model)[2]
   }
}

## look at the 30 largest log odds ratios
head(sort(log.odds.ratios, decreasing=TRUE), 30)
##   accorded    teaches convincing   overpaid       loan   instruct
##  15.892130  14.891560  14.891560  14.891017  14.891017  14.891017
##    assured  convinced  reminding    informs       told      tells
##   5.939316   5.717258   5.424682   5.424682   5.204696   5.161289
##   convince    telling       tell    awarded       give      gives
##   4.917097   4.823372   4.671176   4.664691   4.600955   4.504517
##       lend  informing   reminded   afforded      owing persuading
##   4.328205   4.326592   4.326055   4.326055   4.325518   4.324982
## guarantees     remind      given       gave       cost  persuaded
##   4.324982   4.223914   4.217648   4.201319   4.075735   4.039970
## plot each verb form's log odds ratio against its overall frequency (logged to the
## base of 2); forms never used in the ditransitive have NA as their log odds ratio
verb.freqs.logged <-             # make verb.freqs.logged the result of
   log2(rowSums(verbs.by.ditr))  # logging the verb form frequencies (computed once, used twice)

plot(                                        # plot
   x=verb.freqs.logged,                      # on the x-axis the verb form frequencies
   xlab="Frequency logged to the base of 2", # with this x-axis label
   y=log.odds.ratios,                        # on the y-axis the association strengths
   ylab="Log odds ratio",                    # with this y-axis label
   type="n")                                 # but actually plot nothing
   grid() # add a grid
   text(                              # plot text
      x=verb.freqs.logged,            # at x-axis values that are the verb form frequencies
      y=log.odds.ratios,              # at y-axis values that are the association strengths
      labels=rownames(verbs.by.ditr), # namely the verb forms
      cex=0.8)                        # and 20% smaller than normal