rm(list=ls(all=TRUE)) # clear memory
library(dplyr)
# collecting all words and where they occur #######################################################
## define the locations of the corpus files
## Collect every word of the tagged corpus files together with the file it occurs in.
## Result: two parallel vectors of equal length,
##   all.corpus.words - the lower-cased words of all corpus files, in file order
##   all.corpus.files - for each word, the base name of the file it comes from
corpus.files <- # make corpus.files
  dir( # the content of the directory/folder
    "03_data/Brown_tagged", # with this name
    full.names=TRUE) # and retain the complete paths to the files

## helper: turn the lines of one tagged corpus file into a vector of lower-cased words
##   corpus.lines - character vector with one element per line of the file
##   returns      - character vector of words (character(0) if there are no lines)
extract.corpus.words <- function(corpus.lines) {
  untagged.lines <- # make untagged.lines the result of
    sub( # replacing (once per line: the pattern is anchored at the line start)
      "^.*? ", # everything from the beginning of a line till the 1st space
      "", # by nothing
      corpus.lines, # in the lines of the corpus file
      perl=TRUE) # using Perl-compatible regular expressions
  words.per.line <- # make words.per.line the result of
    strsplit( # splitting up
      untagged.lines, # the lines
      "_[^ ]+ ?", # at every _ followed by non-spaces till maybe a space
      perl=TRUE) # using Perl-compatible regular expressions
  tolower(unlist(words.per.line, use.names=FALSE)) # one all-lowercase vector of words
}

## load every corpus file that was found (not a fixed number of files) and extract its
## words; the result is a list with one character vector of words per file
words.by.file <-
  lapply(corpus.files, function(corpus.file) {
    extract.corpus.words(
      scan( # loading
        corpus.file, # the current corpus file
        what=character(), # which contains character strings
        sep="\n", # separated by line breaks
        quiet=TRUE)) # suppress output
  })

## build both collectors in one step each instead of growing them file by file
all.corpus.words <- unlist(words.by.file, use.names=FALSE)
all.corpus.files <- # make all.corpus.files
  rep(basename(corpus.files), # the name of each corpus file, repeated
      times=lengths(words.by.file)) # as many times as there are words in it
# visual exploration and computing DP #############################################################
par(mfrow=c(2, 2)) # set up a plotting panel with 2 rows and 2 columns
word <- "enormous" # define a word you are interested in
plot( # plot
  all.corpus.words==word, # whether each word in the corpus is your word
  type="h", main=word) # as a vertical-line histogram whose main title is the word itself
wheres.the.word <- # make wheres.the.word
  table( # the table with
    all.corpus.files, # the files in the rows and
    all.corpus.words==word) # whether the word is your word or not in the columns
wheres.the.word # show that table
##
## all.corpus.files FALSE TRUE
## BROWN1_A.TXT 98916 1
## BROWN1_B.TXT 60537 2
## BROWN1_C.TXT 39778 2
## BROWN1_D.TXT 38613 0
## BROWN1_E.TXT 81429 3
## BROWN1_F.TXT 108401 3
## BROWN1_G.TXT 169498 8
## BROWN1_H.TXT 69683 1
## BROWN1_J.TXT 179942 5
## BROWN1_K.TXT 66561 7
## BROWN1_L.TXT 55173 0
## BROWN1_M.TXT 13840 2
## BROWN1_N.TXT 66813 0
## BROWN1_P.TXT 67584 2
## BROWN1_R.TXT 21125 1
## observed percentages: how the occurrences of the word are spread over the files
word.freqs.per.file <- wheres.the.word[,"TRUE"] # frequency of the word in each file
(obs.perc <- word.freqs.per.file / sum(word.freqs.per.file)) # share per file; the parentheses also print it
## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
## 0.02702703 0.05405405 0.05405405 0.00000000 0.08108108
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
## 0.08108108 0.21621622 0.02702703 0.13513514 0.18918919
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
## 0.00000000 0.05405405 0.00000000 0.05405405 0.02702703
## expected percentages: the share of the whole corpus that each file makes up
file.sizes <- rowSums(wheres.the.word) # size of each file in words
(exp.perc <- file.sizes / sum(wheres.the.word)) # divided by the corpus size; the parentheses also print it
## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
## 0.08692714 0.05320099 0.03495821 0.03393267 0.07156152
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
## 0.09526421 0.14895995 0.06123751 0.15813539 0.05849920
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
## 0.04848541 0.01216419 0.05871451 0.05939381 0.01856529
## Gries's DP for "enormous": half the summed absolute differences between observed and expected percentages
0.5 * sum(abs(obs.perc - exp.perc))
## [1] 0.2777663
## high values mean the word is unevenly distributed
## low values mean the word is evenly distributed
word <- "staining" # define a word you are interested in
plot( # plot
  all.corpus.words==word, # whether each word in the corpus is your word
  type="h", main=word) # as a vertical-line histogram whose main title is the word itself
wheres.the.word <- # make wheres.the.word
  table( # the table with
    all.corpus.files, # the files in the rows and
    all.corpus.words==word) # whether the word is your word or not in the columns
wheres.the.word # show that table
##
## all.corpus.files FALSE TRUE
## BROWN1_A.TXT 98917 0
## BROWN1_B.TXT 60539 0
## BROWN1_C.TXT 39780 0
## BROWN1_D.TXT 38613 0
## BROWN1_E.TXT 81432 0
## BROWN1_F.TXT 108404 0
## BROWN1_G.TXT 169506 0
## BROWN1_H.TXT 69684 0
## BROWN1_J.TXT 179910 37
## BROWN1_K.TXT 66568 0
## BROWN1_L.TXT 55173 0
## BROWN1_M.TXT 13842 0
## BROWN1_N.TXT 66813 0
## BROWN1_P.TXT 67586 0
## BROWN1_R.TXT 21126 0
## observed percentages: how the occurrences of the word are spread over the files
word.freqs.per.file <- wheres.the.word[,"TRUE"] # frequency of the word in each file
(obs.perc <- word.freqs.per.file / sum(word.freqs.per.file)) # share per file; the parentheses also print it
## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
## 0 0 0 0 0
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
## 0 0 0 1 0
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
## 0 0 0 0 0
## expected percentages: the share of the whole corpus that each file makes up
file.sizes <- rowSums(wheres.the.word) # size of each file in words
(exp.perc <- file.sizes / sum(wheres.the.word)) # divided by the corpus size; the parentheses also print it
## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
## 0.08692714 0.05320099 0.03495821 0.03393267 0.07156152
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
## 0.09526421 0.14895995 0.06123751 0.15813539 0.05849920
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
## 0.04848541 0.01216419 0.05871451 0.05939381 0.01856529
## Gries's DP for "staining": half the summed absolute differences between observed and expected percentages
0.5 * sum(abs(obs.perc - exp.perc))
## [1] 0.8418646
word <- "church" # define a word you are interested in
plot( # plot
  all.corpus.words==word, # whether each word in the corpus is your word
  type="h", main=word) # as a vertical-line histogram whose main title is the word itself
wheres.the.word <- # make wheres.the.word
  table( # the table with
    all.corpus.files, # the files in the rows and
    all.corpus.words==word) # whether the word is your word or not in the columns
wheres.the.word # show that table
##
## all.corpus.files FALSE TRUE
## BROWN1_A.TXT 98884 33
## BROWN1_B.TXT 60514 25
## BROWN1_C.TXT 39780 0
## BROWN1_D.TXT 38519 94
## BROWN1_E.TXT 81429 3
## BROWN1_F.TXT 108339 65
## BROWN1_G.TXT 169480 26
## BROWN1_H.TXT 69681 3
## BROWN1_J.TXT 179939 8
## BROWN1_K.TXT 66512 56
## BROWN1_L.TXT 55171 2
## BROWN1_M.TXT 13839 3
## BROWN1_N.TXT 66813 0
## BROWN1_P.TXT 67557 29
## BROWN1_R.TXT 21125 1
## observed percentages: how the occurrences of the word are spread over the files
word.freqs.per.file <- wheres.the.word[,"TRUE"] # frequency of the word in each file
(obs.perc <- word.freqs.per.file / sum(word.freqs.per.file)) # share per file; the parentheses also print it
## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
## 0.094827586 0.071839080 0.000000000 0.270114943 0.008620690
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
## 0.186781609 0.074712644 0.008620690 0.022988506 0.160919540
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
## 0.005747126 0.008620690 0.000000000 0.083333333 0.002873563
## expected percentages: the share of the whole corpus that each file makes up
file.sizes <- rowSums(wheres.the.word) # size of each file in words
(exp.perc <- file.sizes / sum(wheres.the.word)) # divided by the corpus size; the parentheses also print it
## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
## 0.08692714 0.05320099 0.03495821 0.03393267 0.07156152
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
## 0.09526421 0.14895995 0.06123751 0.15813539 0.05849920
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
## 0.04848541 0.01216419 0.05871451 0.05939381 0.01856529
## Gries's DP for "church": half the summed absolute differences between observed and expected percentages
0.5 * sum(abs(obs.perc - exp.perc))
## [1] 0.4805981
word <- "place" # define a word you are interested in
plot( # plot
  all.corpus.words==word, # whether each word in the corpus is your word
  type="h", main=word) # as a vertical-line histogram whose main title is the word itself
wheres.the.word <- # make wheres.the.word
  table( # the table with
    all.corpus.files, # the files in the rows and
    all.corpus.words==word) # whether the word is your word or not in the columns
wheres.the.word # show that table
##
## all.corpus.files FALSE TRUE
## BROWN1_A.TXT 98884 33
## BROWN1_B.TXT 60523 16
## BROWN1_C.TXT 39757 23
## BROWN1_D.TXT 38601 12
## BROWN1_E.TXT 81366 66
## BROWN1_F.TXT 108334 70
## BROWN1_G.TXT 169427 79
## BROWN1_H.TXT 69654 30
## BROWN1_J.TXT 179883 64
## BROWN1_K.TXT 66525 43
## BROWN1_L.TXT 55140 33
## BROWN1_M.TXT 13834 8
## BROWN1_N.TXT 66774 39
## BROWN1_P.TXT 67541 45
## BROWN1_R.TXT 21116 10
## observed percentages: how the occurrences of the word are spread over the files
word.freqs.per.file <- wheres.the.word[,"TRUE"] # frequency of the word in each file
(obs.perc <- word.freqs.per.file / sum(word.freqs.per.file)) # share per file; the parentheses also print it
## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
## 0.05779335 0.02802102 0.04028021 0.02101576 0.11558669
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
## 0.12259194 0.13835377 0.05253940 0.11208406 0.07530648
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
## 0.05779335 0.01401051 0.06830123 0.07880911 0.01751313
## expected percentages: the share of the whole corpus that each file makes up
file.sizes <- rowSums(wheres.the.word) # size of each file in words
(exp.perc <- file.sizes / sum(wheres.the.word)) # divided by the corpus size; the parentheses also print it
## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
## 0.08692714 0.05320099 0.03495821 0.03393267 0.07156152
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
## 0.09526421 0.14895995 0.06123751 0.15813539 0.05849920
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
## 0.04848541 0.01216419 0.05871451 0.05939381 0.01856529
## Gries's DP for "place": half the summed absolute differences between observed and expected percentages
0.5 * sum(abs(obs.perc - exp.perc))
## [1] 0.1336384
## go back to the default plotting panel: a single plot per device (1 row, 1 column)
par(mfrow=c(1, 1))