# 05_dispersion-practice.r

rm(list=ls(all=TRUE)) # clear memory
library(dplyr)

# collecting all words and where they occur #######################################################
## define the locations of the corpus files
corpus.files <-                  # make corpus.files
      dir(                       # the content of the directory/folder
         "03_data/Brown_tagged", # with this name
         full.names=TRUE)        # and retain the complete paths to the files

## stop right away with a clear message if the corpus folder is missing or empty
if (length(corpus.files) == 0) {
   stop("no corpus files found in 03_data/Brown_tagged", call.=FALSE)
}

## create a collector/container with one slot per corpus file; filling preallocated slots avoids
## re-copying an ever-growing vector of words on every iteration
words.per.file <- vector("list", length(corpus.files))

## load each corpus file, extract the words, and store them in the collector/container for later
for (counter in seq_along(corpus.files)) { # for each corpus file that was found
   current.corpus.file <-       # make current.corpus.file the result of
      scan(                     # loading
         corpus.files[counter], # the corpus file number 'counter'
         what=character(),      # which contains character strings
         sep="\n",              # separated by line breaks
         quiet=TRUE)            # suppress output

   current.corpus.file <-     # make current.corpus.file the result of
      gsub(                   # replacing
         "^.*? ",             # everything from the beginning of a line till the 1st space
         "",                  # by nothing
         current.corpus.file, # in current.corpus.file
         perl=TRUE)           # using Perl-compatible regular expressions

   current.corpus.words <-          # make current.corpus.words
      tolower(                      # the all-lowercase version of
         unlist(                    # the vector that results from
            strsplit(               # splitting up
               current.corpus.file, # current.corpus.file
               "_[^ ]+ ?",          # at every _ followed by non-spaces till maybe a space
               perl=TRUE)))         # using Perl-compatible regular expressions

   words.per.file[[counter]] <- # put into the slot of the current file
      current.corpus.words      # the words of the current file
}

## flatten the collector/container into one long vector of words and one equally long vector that
## says for each word which corpus file it came from
all.corpus.words <-     # make all.corpus.words
   unlist(              # the single vector made from
      words.per.file)   # the words of all files, in file order

all.corpus.files <-                 # make all.corpus.files
   rep(                             # the result of repeating
      basename(corpus.files),       # the name of each corpus file
      times=lengths(words.per.file)) # as many times as there are words in it

rm(words.per.file) # the per-file collector/container is not needed anymore



# visual exploration and computing DP #############################################################
par(mfrow=c(2, 2)) # set up a plotting panel with 2 rows and 2 columns
word <- "enormous"           # define a word you are interested in
plot(                        # plot, for every position in the corpus,
   all.corpus.words == word, # whether the word at that position is your word
   main=word,                # with the word itself as the main heading
   type="h")                 # as a vertical-line histogram
(wheres.the.word <-             # make wheres.the.word (and show it right away)
   table(                       # the table with
      all.corpus.files,         # the files in the rows and
      all.corpus.words == word)) # whether the word is "enormous" or not in the columns

##
## all.corpus.files  FALSE   TRUE
##     BROWN1_A.TXT  98916      1
##     BROWN1_B.TXT  60537      2
##     BROWN1_C.TXT  39778      2
##     BROWN1_D.TXT  38613      0
##     BROWN1_E.TXT  81429      3
##     BROWN1_F.TXT 108401      3
##     BROWN1_G.TXT 169498      8
##     BROWN1_H.TXT  69683      1
##     BROWN1_J.TXT 179942      5
##     BROWN1_K.TXT  66561      7
##     BROWN1_L.TXT  55173      0
##     BROWN1_M.TXT  13840      2
##     BROWN1_N.TXT  66813      0
##     BROWN1_P.TXT  67584      2
##     BROWN1_R.TXT  21125      1

## note: a word that never occurs yields a table without a "TRUE" column, so the next step fails
obs.perc <-                      # make obs.perc
   prop.table(                   # the proportions (each value divided by their sum) of
      wheres.the.word[, "TRUE"]) # the frequencies of "enormous" per file
obs.perc                         # show these observed proportions

## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
##   0.02702703   0.05405405   0.05405405   0.00000000   0.08108108
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
##   0.08108108   0.21621622   0.02702703   0.13513514   0.18918919
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
##   0.00000000   0.05405405   0.00000000   0.05405405   0.02702703

exp.perc <-                     # make exp.perc
   prop.table(                  # the proportions (each value divided by their sum) of
      rowSums(wheres.the.word)) # the sizes of the files in words
exp.perc                        # show these expected proportions

## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
##   0.08692714   0.05320099   0.03495821   0.03393267   0.07156152
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
##   0.09526421   0.14895995   0.06123751   0.15813539   0.05849920
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
##   0.04848541   0.01216419   0.05871451   0.05939381   0.01856529

0.5 * sum(abs(obs.perc - exp.perc)) # compute Gries's DP for "enormous"

## [1] 0.2777663

## high values (approaching 1) mean the word is unevenly distributed
## low values (approaching 0) mean the word is evenly distributed



word <- "staining"           # define a word you are interested in
plot(                        # plot, for every position in the corpus,
   all.corpus.words == word, # whether the word at that position is your word
   main=word,                # with the word itself as the main heading
   type="h")                 # as a vertical-line histogram
(wheres.the.word <-             # make wheres.the.word (and show it right away)
   table(                       # the table with
      all.corpus.files,         # the files in the rows and
      all.corpus.words == word)) # whether the word is "staining" or not in the columns

##
## all.corpus.files  FALSE   TRUE
##     BROWN1_A.TXT  98917      0
##     BROWN1_B.TXT  60539      0
##     BROWN1_C.TXT  39780      0
##     BROWN1_D.TXT  38613      0
##     BROWN1_E.TXT  81432      0
##     BROWN1_F.TXT 108404      0
##     BROWN1_G.TXT 169506      0
##     BROWN1_H.TXT  69684      0
##     BROWN1_J.TXT 179910     37
##     BROWN1_K.TXT  66568      0
##     BROWN1_L.TXT  55173      0
##     BROWN1_M.TXT  13842      0
##     BROWN1_N.TXT  66813      0
##     BROWN1_P.TXT  67586      0
##     BROWN1_R.TXT  21126      0

## note: a word that never occurs yields a table without a "TRUE" column, so the next step fails
obs.perc <-                      # make obs.perc
   prop.table(                   # the proportions (each value divided by their sum) of
      wheres.the.word[, "TRUE"]) # the frequencies of "staining" per file
obs.perc                         # show these observed proportions

## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
##            0            0            0            0            0
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
##            0            0            0            1            0
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
##            0            0            0            0            0

exp.perc <-                     # make exp.perc
   prop.table(                  # the proportions (each value divided by their sum) of
      rowSums(wheres.the.word)) # the sizes of the files in words
exp.perc                        # show these expected proportions

## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
##   0.08692714   0.05320099   0.03495821   0.03393267   0.07156152
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
##   0.09526421   0.14895995   0.06123751   0.15813539   0.05849920
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
##   0.04848541   0.01216419   0.05871451   0.05939381   0.01856529

0.5 * sum(abs(obs.perc - exp.perc)) # compute Gries's DP for "staining"

## [1] 0.8418646

word <- "church"             # define a word you are interested in
plot(                        # plot, for every position in the corpus,
   all.corpus.words == word, # whether the word at that position is your word
   main=word,                # with the word itself as the main heading
   type="h")                 # as a vertical-line histogram
(wheres.the.word <-             # make wheres.the.word (and show it right away)
   table(                       # the table with
      all.corpus.files,         # the files in the rows and
      all.corpus.words == word)) # whether the word is "church" or not in the columns

##
## all.corpus.files  FALSE   TRUE
##     BROWN1_A.TXT  98884     33
##     BROWN1_B.TXT  60514     25
##     BROWN1_C.TXT  39780      0
##     BROWN1_D.TXT  38519     94
##     BROWN1_E.TXT  81429      3
##     BROWN1_F.TXT 108339     65
##     BROWN1_G.TXT 169480     26
##     BROWN1_H.TXT  69681      3
##     BROWN1_J.TXT 179939      8
##     BROWN1_K.TXT  66512     56
##     BROWN1_L.TXT  55171      2
##     BROWN1_M.TXT  13839      3
##     BROWN1_N.TXT  66813      0
##     BROWN1_P.TXT  67557     29
##     BROWN1_R.TXT  21125      1

## note: a word that never occurs yields a table without a "TRUE" column, so the next step fails
obs.perc <-                      # make obs.perc
   prop.table(                   # the proportions (each value divided by their sum) of
      wheres.the.word[, "TRUE"]) # the frequencies of "church" per file
obs.perc                         # show these observed proportions

## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
##  0.094827586  0.071839080  0.000000000  0.270114943  0.008620690
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
##  0.186781609  0.074712644  0.008620690  0.022988506  0.160919540
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
##  0.005747126  0.008620690  0.000000000  0.083333333  0.002873563

exp.perc <-                     # make exp.perc
   prop.table(                  # the proportions (each value divided by their sum) of
      rowSums(wheres.the.word)) # the sizes of the files in words
exp.perc                        # show these expected proportions

## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
##   0.08692714   0.05320099   0.03495821   0.03393267   0.07156152
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
##   0.09526421   0.14895995   0.06123751   0.15813539   0.05849920
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
##   0.04848541   0.01216419   0.05871451   0.05939381   0.01856529

0.5 * sum(abs(obs.perc - exp.perc)) # compute Gries's DP for "church"

## [1] 0.4805981

word <- "place"              # define a word you are interested in
plot(                        # plot, for every position in the corpus,
   all.corpus.words == word, # whether the word at that position is your word
   main=word,                # with the word itself as the main heading
   type="h")                 # as a vertical-line histogram

(wheres.the.word <-             # make wheres.the.word (and show it right away)
   table(                       # the table with
      all.corpus.files,         # the files in the rows and
      all.corpus.words == word)) # whether the word is "place" or not in the columns

##
## all.corpus.files  FALSE   TRUE
##     BROWN1_A.TXT  98884     33
##     BROWN1_B.TXT  60523     16
##     BROWN1_C.TXT  39757     23
##     BROWN1_D.TXT  38601     12
##     BROWN1_E.TXT  81366     66
##     BROWN1_F.TXT 108334     70
##     BROWN1_G.TXT 169427     79
##     BROWN1_H.TXT  69654     30
##     BROWN1_J.TXT 179883     64
##     BROWN1_K.TXT  66525     43
##     BROWN1_L.TXT  55140     33
##     BROWN1_M.TXT  13834      8
##     BROWN1_N.TXT  66774     39
##     BROWN1_P.TXT  67541     45
##     BROWN1_R.TXT  21116     10

## note: a word that never occurs yields a table without a "TRUE" column, so the next step fails
obs.perc <-                      # make obs.perc
   prop.table(                   # the proportions (each value divided by their sum) of
      wheres.the.word[, "TRUE"]) # the frequencies of "place" per file
obs.perc                         # show these observed proportions

## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
##   0.05779335   0.02802102   0.04028021   0.02101576   0.11558669
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
##   0.12259194   0.13835377   0.05253940   0.11208406   0.07530648
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
##   0.05779335   0.01401051   0.06830123   0.07880911   0.01751313

exp.perc <-                     # make exp.perc
   prop.table(                  # the proportions (each value divided by their sum) of
      rowSums(wheres.the.word)) # the sizes of the files in words
exp.perc                        # show these expected proportions

## BROWN1_A.TXT BROWN1_B.TXT BROWN1_C.TXT BROWN1_D.TXT BROWN1_E.TXT
##   0.08692714   0.05320099   0.03495821   0.03393267   0.07156152
## BROWN1_F.TXT BROWN1_G.TXT BROWN1_H.TXT BROWN1_J.TXT BROWN1_K.TXT
##   0.09526421   0.14895995   0.06123751   0.15813539   0.05849920
## BROWN1_L.TXT BROWN1_M.TXT BROWN1_N.TXT BROWN1_P.TXT BROWN1_R.TXT
##   0.04848541   0.01216419   0.05871451   0.05939381   0.01856529

0.5 * sum(abs(obs.perc - exp.perc)) # compute Gries's DP for "place"

## [1] 0.1336384

par(mfrow=c(1, 1)) # revert to default plotting panel with 1 row and 1 column

# 05_dispersion-practice.r
#
# Stefan Th. Gries
#
# Sun Sept 30 12:34:56 2018