# 03_frequency-practice.r

# Start from an empty workspace: remove every object in the global
# environment, including hidden ones whose names begin with a dot.
# The argument of ls() is spelled out as all.names rather than relying on
# partial matching of `all`.
# NOTE(review): this also wipes whatever the user already had in the session;
# consider running the script in a fresh R session instead.
rm(list = ls(all.names = TRUE))
library(dplyr)

# a frequency list of a single untagged corpus file ###############################################
## load that corpus file
# Read the untagged corpus file into a character vector, splitting the
# file content at line breaks.
corpus.file <- scan(
   file = "03_data/Brown1_J_untagged.TXT",
   what = character(), # the fields are character strings
   sep = "\n",         # fields are delimited by line breaks
   quiet = TRUE        # do not report how many items were read
)

## inspect the first 10 lines of corpus.file
head(corpus.file, n = 10)

##  [1] "J01 0010 #1. INTRODUCTION# IT HAS recently become practical to use the"
##  [2] "J01 0020 radio emission of the moon and planets as a new source of information"
##  [3] "J01 0030 about these bodies and their atmospheres. The results of present observations"
##  [4] "J01 0040 of the thermal radio emission of the moon are consistent with"
##  [5] "J01 0050 the very low thermal conductivity of the surface layer which was derived"
##  [6] "J01 0060 from the variation in the infrared emission during eclipses (e&g&,"
##  [7] "J01 0070 Garstung, 1958). When sufficiently accurate and complete measurements"
##  [8] "J01 0080 are available, it will be possible to set limits on the thermal"
##  [9] "J01 0090 and electrical characteristics of the surface and subsurface materials"
## [10] "J01 0100 of the moon.   Observations of the radio emission of a planet"

tail(corpus.file, n = 10) # the last 10 lines of corpus.file

##  [1] "J80 1800 The sensing of this rotation by the ~<X> gyro can be utilized"
##  [2] "J80 1810 to direct the platform into proper heading.   In Fig& 7-4,"
##  [3] "J80 1820 the input axis of the three-axis platform is shown at some point on the"
##  [4] "J80 1830 earth. The point is at a latitude ~|l, and the platform is at"
##  [5] "J80 1840 an error in heading east. The earth is spinning at an angular velocity"
##  [6] "J80 1850 ~\\q equal to one revolution per 24 hr&. When the platform is"
##  [7] "J80 1860 level, ~|e is a rotation about the ~<Z> axis of the platform"
##  [8] "J80 1870 **f. Since the earth is rotating and the unleveled gyro-stabilized platform"
##  [9] "J80 1880 is fixed with respect to a reference in space, an observer on"
## [10] "J80 1890 the earth will see the platform rotating (with respect to the earth)."

## remove the line annotation
# Overwrite corpus.file with a copy in which the first 9 characters of
# each line (the line annotation) have been replaced by nothing.
corpus.file <- gsub(
   pattern = "^.........", # any 9 characters at the start of a line
   replacement = "",
   x = corpus.file,
   perl = TRUE             # Perl-compatible regular expressions
)

## inspect the first 10 lines of the cleaned corpus.file
head(corpus.file, n = 10)

##  [1] "#1. INTRODUCTION# IT HAS recently become practical to use the"
##  [2] "radio emission of the moon and planets as a new source of information"
##  [3] "about these bodies and their atmospheres. The results of present observations"
##  [4] "of the thermal radio emission of the moon are consistent with"
##  [5] "the very low thermal conductivity of the surface layer which was derived"
##  [6] "from the variation in the infrared emission during eclipses (e&g&,"
##  [7] "Garstung, 1958). When sufficiently accurate and complete measurements"
##  [8] "are available, it will be possible to set limits on the thermal"
##  [9] "and electrical characteristics of the surface and subsurface materials"
## [10] "of the moon.   Observations of the radio emission of a planet"

tail(corpus.file, n = 10) # the last 10 lines of the cleaned corpus.file

##  [1] "The sensing of this rotation by the ~<X> gyro can be utilized"
##  [2] "to direct the platform into proper heading.   In Fig& 7-4,"
##  [3] "the input axis of the three-axis platform is shown at some point on the"
##  [4] "earth. The point is at a latitude ~|l, and the platform is at"
##  [5] "an error in heading east. The earth is spinning at an angular velocity"
##  [6] "~\\q equal to one revolution per 24 hr&. When the platform is"
##  [7] "level, ~|e is a rotation about the ~<Z> axis of the platform"
##  [8] "**f. Since the earth is rotating and the unleveled gyro-stabilized platform"
##  [9] "is fixed with respect to a reference in space, an observer on"
## [10] "the earth will see the platform rotating (with respect to the earth)."

## identify all characters used in the file
# Inventory of the distinct characters in corpus.file: split every line into
# single characters, flatten the pieces into one vector, drop the duplicates,
# and print what is left in sorted order.
sort(unique(unlist(strsplit(corpus.file, ""))))

##  [1] " "  "_"  "-"  ","  ";"  ":"  "!"  "?"  "."  "'"  "\"" "("  ")"  "["
## [15] "]"  "{"  "}"  "@"  "*"  "/"  "\\" "&"  "#"  "%"  "`"  "^"  "+"  "<"
## [29] ">"  "|"  "~"  "$"  "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"
## [43] "a"  "A"  "b"  "B"  "c"  "C"  "d"  "D"  "e"  "E"  "f"  "F"  "g"  "G"
## [57] "h"  "H"  "i"  "I"  "j"  "J"  "k"  "K"  "l"  "L"  "m"  "M"  "n"  "N"
## [71] "o"  "O"  "p"  "P"  "q"  "Q"  "r"  "R"  "s"  "S"  "t"  "T"  "u"  "U"
## [85] "v"  "V"  "w"  "W"  "x"  "X"  "y"  "Y"  "z"  "Z"

## extract all words from the file
# Tokenize corpus.file into lower-case words: split every line at each run
# of one or more non-letter characters, flatten the pieces into one vector,
# and convert them to lower case.
corpus.words <- strsplit(
   corpus.file,
   "[^\\p{L}]+", # one or more characters that are not letters
   perl = TRUE   # \p{L} needs Perl-compatible regular expressions
) %>%
   unlist() %>%
   tolower()

# strsplit() returns "" as the first piece whenever a line begins with a
# non-letter (e.g. "#1. INTRODUCTION#"). Those empty strings are not words
# but would be counted like one in the frequency list, so discard them.
corpus.words <- corpus.words[nzchar(corpus.words)]

## tabulate and sort by frequency
# Count how often each element of corpus.words occurs and order the counts
# from the most to the least frequent.
corpus.freqlist <- sort(table(corpus.words), decreasing = TRUE)

## inspect the 30 most frequent entries
head(corpus.freqlist, n = 30)

## corpus.words
##   the    of   and    in    to     a    is  that   for    be    as    by
## 12540  7459  4288  4110  3958  3530  2410  1715  1568  1360  1291  1227
##  with    it   was     f   are  this    on    or which        from   not
##  1158  1149  1115  1009   993   993   957   907   835   795   793   787
##    an    at  were  have   one     s
##   739   739   633   551   550   539

tail(corpus.freqlist, n = 30) # the final 30 entries, i.e. 30 of the least frequent words

## corpus.words
##     yakima       yale     yamabe      yarns         yd   yearbook
##          1          1          1          1          1          1
##  yearnings     yeasts      yeats   yielding     yinger        yok
##          1          1          1          1          1          1
##     yorker    yorkers   yorktown    younger      yours yourselves
##          1          1          1          1          1          1
##         yr    yucatan     yujobo       yuri       yyyy      zabel
##          1          1          1          1          1          1
##       zeal      zeros       zest       zinc      zones        zur
##          1          1          1          1          1          1

## plot a Zipfian curve
# Zipfian curve of corpus.freqlist: logged frequency ranks on the x-axis
# against logged frequencies on the y-axis, drawn as filled circles in a
# semi-transparent colour so that overplotted points appear darker.
plot(
   x = log2(rank(-corpus.freqlist)),
   y = log2(corpus.freqlist),
   pch = 16,
   col = "#00000020"
)
grid() # separate call: adds a grid to the plot drawn above

# a frequency list of several tagged corpus files #################################################
## define the locations of the corpus files
# Collect the paths of everything in the tagged-corpus folder; with
# full.names = TRUE each entry carries the folder path in front of the
# file name.
corpus.files <- list.files(path = "03_data/Brown_tagged", full.names = TRUE)

## create a collector/container for all words from all files
# One slot per corpus file. Filling a pre-sized list avoids re-copying an
# ever-growing vector on every iteration.
words.per.file <- vector("list", length(corpus.files))

## load each corpus file, extract the words, and store them for later
for (counter in seq_along(corpus.files)) { # every file found, however many there are
   # read file number 'counter' as character strings split at line breaks
   current.corpus.file <- scan(
      corpus.files[counter],
      what = character(),
      sep = "\n",
      quiet = TRUE
   )

   # delete everything from the beginning of each line up to the 1st space
   current.corpus.file <- gsub("^.*? ", "", current.corpus.file, perl = TRUE)

   # split at every _ followed by non-spaces and maybe one space, then
   # flatten the pieces into one vector and convert them to lower case
   current.corpus.words <- strsplit(
      current.corpus.file,
      "_[^ ]+ ?",
      perl = TRUE
   ) %>%
      unlist() %>%
      tolower()

   words.per.file[[counter]] <- current.corpus.words
}

## combine the words of all files into one vector
all.corpus.words <- unlist(words.per.file, use.names = FALSE)

# Keep only the elements that contain at least one letter; value = TRUE makes
# grep() return the matching elements themselves rather than their positions.
all.corpus.words <- grep(
   pattern = "\\p{L}",
   x = all.corpus.words,
   value = TRUE,
   perl = TRUE # \p{L} needs Perl-compatible regular expressions
)

## tabulate and sort by frequency
# Count how often each element of all.corpus.words occurs and order the
# counts from the most to the least frequent; this replaces the earlier
# content of corpus.freqlist.
corpus.freqlist <- sort(table(all.corpus.words), decreasing = TRUE)

## inspect the 30 most frequent entries
head(corpus.freqlist, n = 30)

## all.corpus.words
##   the    of   and    to     a    in  that    is   was    he   for    it
## 70005 36423 28861 26178 23140 21344 10602 10109  9836  9565  9493  8763
##  with    as   his    on    be    at    by     i  this   had   not   are
##  7295  7252  6997  6744  6380  5383  5305  5159  5146  5140  4612  4394
##   but  from    or  have    an  they
##  4383  4373  4205  3946  3741  3624

tail(corpus.freqlist, n = 30) # the final 30 entries, i.e. 30 of the least frequent words

## all.corpus.words
##          zend-avesta              zennist               zeroed
##                    1                    1                    1
##              zhitkov           zhitzhakli                 zhok
##                    1                    1                    1
##                ziggy ziminska-sygietynska            zingggg-o
##                    1                    1                    1
##               zinman              zionism             zionists
##                    1                    1                    1
##                  zip               zipped               zipper
##                    1                    1                    1
##              ziraldo               zlotys                  zoe
##                    1                    1                    1
##               zombie              zombies                zoned
##                    1                    1                    1
##                zooey              zoology               zoomed
##                    1                    1                    1
##              zooming                zooms               zoooop
##                    1                    1                    1
##            zorrillas                   zu                 zwei
##                    1                    1                    1

# Zipfian curve of corpus.freqlist: logged frequency ranks on the x-axis
# against logged frequencies on the y-axis, drawn as filled circles in a
# semi-transparent colour so that overplotted points appear darker.
plot(
   x = log2(rank(-corpus.freqlist)),
   y = log2(corpus.freqlist),
   pch = 16,
   col = "#00000020"
)
grid() # separate call: adds a grid to the plot drawn above

# I will show you a frequency list of 'things in a slot' later.

# 03_frequency-practice.r
#
# Stefan Th. Gries
#
# Sun Sep 30 12:34:56 2018