# Start from a clean workspace: remove every object (including hidden,
# dot-prefixed ones) from the current environment.
rm(list=ls(all=TRUE)) # clear memory
# Load helper code from a remote script; presumably this defines
# exact.matches.2(), which is called further below but not defined in this
# file -- TODO confirm.
# NOTE(review): this runs code fetched over plain HTTP every time the script
# is executed, so the script needs network access and trusts that URL.
source("http://www.linguistics.ucsb.edu/faculty/stgries/exact.matches.2.r")
# collecting all verbs and all ditransitives for association strengths ############################
## define the locations of the corpus files
## Collect, from every corpus file, all lines annotated as verbs (lower-cased)
## together with a parallel logical flag saying whether each verb line is
## annotated as ditransitive. Results: all.verbs (character) and all.ditrans
## (logical), which have the same length and the same order.
corpus.files <- dir( # the files in the corpus directory ...
   "03_data/ICEGB_sampled", # ... with this name,
   full.names=TRUE) # listed with their complete paths
if (length(corpus.files) == 0) { # nothing to read: say so instead of failing obscurely later
   warning("no corpus files found in 03_data/ICEGB_sampled", call.=FALSE)
}
## one slot per corpus file; preallocating avoids re-copying ever-growing
## vectors on every iteration
verbs.per.file <- vector("list", length(corpus.files))
ditrans.per.file <- vector("list", length(corpus.files))
for (counter in seq_along(corpus.files)) { # for each corpus file actually found (no hard-coded count)
   current.corpus.file <- scan( # load the corpus file number 'counter'
      corpus.files[counter],
      what=character(), # as character strings,
      sep="\n", # one element per line,
      quiet=TRUE) # without printing how many items were read
   curr.verbs <- grep( # keep the lines that contain
      "\\bV\\(", # V (at a word boundary) before (
      current.corpus.file,
      perl=TRUE,
      value=TRUE) # return the matching lines themselves, not their positions
   curr.ditrans <- grepl( # for each of those verb lines (!), is it
      "V\\(ditr", # a V whose annotation starts with ditr?
      curr.verbs,
      perl=TRUE)
   verbs.per.file[[counter]] <- tolower(curr.verbs) # store the verb lines in lower case
   ditrans.per.file[[counter]] <- curr.ditrans # store the matching ditransitive flags
}
## flatten the per-file results into the two long parallel vectors
all.verbs <- unlist(verbs.per.file, use.names=FALSE)
all.ditrans <- unlist(ditrans.per.file, use.names=FALSE)
# cross-tabulate verbs and ditransitives ##########################################################
## Reduce every annotated verb line to the verb form itself, in one step:
## exact.matches.2() is asked for the material that follows a { and is not a },
## without concordance output, and only the first component of its result is
## used; any <l> left in those strings is then deleted.
## NOTE(review): exact.matches.2() is not defined in this file; this relies on
## it returning exactly one match per input line so that all.verbs stays
## parallel to all.ditrans -- confirm.
all.verbs <- gsub(
   "<l>", "",
   exact.matches.2("(?<={)[^}]+", all.verbs, gen.conc.output=FALSE)[[1]],
   perl=TRUE)
## Cross-tabulate verb forms (rows) against ditransitive status (columns) ...
verbs.by.ditr <- table(all.verbs, all.ditrans)
## ... and put the forms that are most frequent in the ditransitive on top
ditr.rank <- order(verbs.by.ditr[,"TRUE"], decreasing=TRUE)
verbs.by.ditr <- verbs.by.ditr[ditr.rank,]
## look at the constructional frequencies:
colSums(verbs.by.ditr) # non-ditransitives:139039 ditransitives:1841
## FALSE TRUE
## 139039 1841
## look at ...
head(verbs.by.ditr, 20) # ... the first 20 rows of verbs.by.ditr
## all.ditrans
## all.verbs FALSE TRUE
## give 206 237
## told 103 219
## tell 147 187
## given 171 142
## gave 97 82
## telling 37 59
## gives 50 58
## giving 82 50
## show 180 44
## ask 178 43
## asked 184 40
## send 112 37
## sent 115 35
## tells 15 34
## offered 42 26
## get 1442 23
## offer 55 18
## cost 22 17
## convinced 3 12
## showed 58 12
### the right column is the promised frequency list of 'things in a slot'
# compute one measure of association strength: residuals from chi-squared #########################
## Association strength = the residuals of a chi-squared test (without
## continuity correction) on the verb-by-ditransitive table
assoc.strengths <- chisq.test(verbs.by.ditr, correct=FALSE)$residuals
## Re-order the rows so that the largest residuals for the ditransitive
## column ("TRUE") come first
by.residual <- order(assoc.strengths[,"TRUE"], decreasing=TRUE)
assoc.strengths <- assoc.strengths[by.residual,]
## look at ...
head(assoc.strengths[,"TRUE"], 30) # ... the most strongly associated verb forms, the top 30
## told give tell given gave telling gives
## 104.71002 96.09577 87.41971 68.18999 52.08543 51.55612 47.63382
## tells giving convinced offered send assured show
## 41.68901 36.75644 26.66126 26.63876 25.12051 24.85666 24.00646
## ask sent cost asked convince remind offer
## 23.60348 23.59880 23.09912 21.66852 20.61375 17.56362 17.45259
## awarded lend taught accorded warned persuaded promised
## 17.28087 15.93787 15.92685 14.95360 14.55760 13.59993 13.50385
## teach informs
## 13.41237 12.89304
### note: this way of computing association strengths conflates frequency and effect size (like G-squared)!
# excursus: showing you that residuals from chi-squared are nearly the same as G-squared ##########
## G-squared per verb form: one slot for every row of verbs.by.ditr, named by
## verb form and filled with NA until a value is computed
verb.forms <- rownames(verbs.by.ditr)
g.squareds <- rep(NA, length(verb.forms))
names(g.squareds) <- verb.forms
for (verb in verb.forms) { # for each of the verb forms
   if (verbs.by.ditr[verb,"TRUE"] != 0) { # only forms attested in the ditransitive; the others stay NA
      ## binary logistic regression: ditransitive or not ~ is this the current verb form?
      fit <- glm(all.ditrans ~ all.verbs==verb, family=binomial)
      ## G-squared = how much the deviance drops relative to the null model
      g.squareds[verb] <- fit$null.deviance-fit$deviance
   }
}
## look at the 30 largest G-squared values
head(sort(g.squareds, decreasing=TRUE), 30)
## told give tell given gave telling
## 1525.42533 1480.30586 1186.94462 815.91777 470.57796 386.66840
## gives giving tells show ask send
## 357.13223 262.05683 235.60342 165.43678 160.79999 157.58579
## sent asked offered cost convinced offer
## 144.28059 142.40063 136.54113 94.78462 89.24610 76.21397
## assured convince remind taught teach awarded
## 76.04456 60.00201 52.09516 50.77835 45.80067 44.58310
## warned showed lend promised informed persuaded
## 44.21980 41.55587 41.52804 41.34500 35.82393 33.15888
## Bring g.squareds into the same verb order as the rows of assoc.strengths
g.squareds <- g.squareds[rownames(assoc.strengths)]
## Coordinates used for both the empty plot and the labels: the residuals for
## the ditransitive column, and G-squared carrying the sign of those residuals
residual.scores <- assoc.strengths[,"TRUE"]
signed.g.squareds <- g.squareds*sign(residual.scores)
## Set up the coordinate system without drawing points (type="n"), add a grid,
## then print each verb form at its position, 20% smaller than normal
plot(x=residual.scores, y=signed.g.squareds,
   xlab="Residuals from chi-squared test", ylab="G-squared",
   type="n")
grid()
text(x=residual.scores, y=signed.g.squareds,
   labels=names(g.squareds), cex=0.8)
## One-tailed Spearman rank correlation (a positive correlation is expected)
## between the residuals and the signed G-squared values
cor.test(assoc.strengths[,"TRUE"], g.squareds*sign(assoc.strengths[,"TRUE"]),
   method="spearman", alternative="greater")
##
## Spearman's rank correlation rho
##
## data: assoc.strengths[, "TRUE"] and g.squareds * sign(assoc.strengths[, "TRUE"])
## S = 10855, p-value < 2.2e-16
## alternative hypothesis: true rho is greater than 0
## sample estimates:
## rho
## 0.9874212
# excursus: separating frequency and effect size #################################################
## Log odds ratio per verb form: one slot for every row of verbs.by.ditr, named
## by verb form and filled with NA until a value is computed
verb.forms <- rownames(verbs.by.ditr)
log.odds.ratios <- rep(NA, length(verb.forms))
names(log.odds.ratios) <- verb.forms
for (verb in verb.forms) { # for each of the verb forms
   if (verbs.by.ditr[verb,"TRUE"] != 0) { # only forms attested in the ditransitive; the others stay NA
      ## binary logistic regression: ditransitive or not ~ is this the current verb form?
      fit <- glm(all.ditrans ~ all.verbs==verb, family=binomial)
      ## the second coefficient (the one for the predictor) is the log odds ratio
      ## NOTE(review): for a form that never occurs outside the ditransitive no
      ## finite estimate exists, so very large values may be an artefact of
      ## the fitting rather than a real effect size -- confirm.
      log.odds.ratios[verb] <- coef(fit)[2]
   }
}
## look at the 30 largest log odds ratios
head(sort(log.odds.ratios, decreasing=TRUE), 30)
## accorded teaches convincing overpaid loan instruct
## 15.892130 14.891560 14.891560 14.891017 14.891017 14.891017
## assured convinced reminding informs told tells
## 5.939316 5.717258 5.424682 5.424682 5.204696 5.161289
## convince telling tell awarded give gives
## 4.917097 4.823372 4.671176 4.664691 4.600955 4.504517
## lend informing reminded afforded owing persuading
## 4.328205 4.326592 4.326055 4.326055 4.325518 4.324982
## guarantees remind given gave cost persuaded
## 4.324982 4.223914 4.217648 4.201319 4.075735 4.039970
## Plot every verb form by its overall frequency (x) and its log odds ratio
## (y) to show frequency and effect size separately. Verb forms whose log odds
## ratio is NA have no y-coordinate and therefore do not appear.
log2.freqs <- log2(rowSums(verbs.by.ditr)) # overall frequency of each verb form, logged to the base of 2
plot( # set up the coordinate system
   x=log2.freqs, # on the x-axis the logged verb form frequencies
   xlab="Frequency logged to the base of 2", # with this x-axis label
   y=log.odds.ratios, # on the y-axis the association strengths
   ylab="Log odds ratio", # with this y-axis label
   type="n") # but actually plot nothing
grid() # add a grid
text( # print
   x=log2.freqs, # at the logged verb form frequencies
   y=log.odds.ratios, # and at the association strengths
   labels=rownames(verbs.by.ditr), # the verb forms themselves
   cex=0.8) # 20% smaller than normal