rm(list=ls(all.names=TRUE)) # clear memory (all.names=TRUE also removes objects whose names start with a dot)
library(dplyr)
# a frequency list of a single untagged corpus file ###############################################
## load that corpus file
corpus.file <- scan(                     # read into corpus.file
   file="03_data/Brown1_J_untagged.TXT", # the corpus file at this path
   what=character(),                     # as character strings,
   sep="\n",                             # splitting the input at line breaks,
   quiet=TRUE)                           # without printing how many items were read
## inspect the beginning of the file
head(corpus.file, n=10)                  # the first 10 lines of corpus.file
## [1] "J01 0010 #1. INTRODUCTION# IT HAS recently become practical to use the"
## [2] "J01 0020 radio emission of the moon and planets as a new source of information"
## [3] "J01 0030 about these bodies and their atmospheres. The results of present observations"
## [4] "J01 0040 of the thermal radio emission of the moon are consistent with"
## [5] "J01 0050 the very low thermal conductivity of the surface layer which was derived"
## [6] "J01 0060 from the variation in the infrared emission during eclipses (e&g&,"
## [7] "J01 0070 Garstung, 1958). When sufficiently accurate and complete measurements"
## [8] "J01 0080 are available, it will be possible to set limits on the thermal"
## [9] "J01 0090 and electrical characteristics of the surface and subsurface materials"
## [10] "J01 0100 of the moon. Observations of the radio emission of a planet"
tail(corpus.file, n=10) # inspect the end of the file: the last 10 lines of corpus.file
## [1] "J80 1800 The sensing of this rotation by the ~<X> gyro can be utilized"
## [2] "J80 1810 to direct the platform into proper heading. In Fig& 7-4,"
## [3] "J80 1820 the input axis of the three-axis platform is shown at some point on the"
## [4] "J80 1830 earth. The point is at a latitude ~|l, and the platform is at"
## [5] "J80 1840 an error in heading east. The earth is spinning at an angular velocity"
## [6] "J80 1850 ~\\q equal to one revolution per 24 hr&. When the platform is"
## [7] "J80 1860 level, ~|e is a rotation about the ~<Z> axis of the platform"
## [8] "J80 1870 **f. Since the earth is rotating and the unleveled gyro-stabilized platform"
## [9] "J80 1880 is fixed with respect to a reference in space, an observer on"
## [10] "J80 1890 the earth will see the platform rotating (with respect to the earth)."
## remove the line annotation
corpus.file <- sub(        # overwrite corpus.file after substituting
   pattern="^.........",   # the 9 leading characters of each line (lines shorter than that stay unchanged)
   replacement="",         # with an empty string
   x=corpus.file,          # in every element of corpus.file
   perl=TRUE)              # (the pattern is a Perl-compatible regular expression)
## check the result
head(corpus.file, n=10)    # the first 10 lines, now without the line annotation
## [1] "#1. INTRODUCTION# IT HAS recently become practical to use the"
## [2] "radio emission of the moon and planets as a new source of information"
## [3] "about these bodies and their atmospheres. The results of present observations"
## [4] "of the thermal radio emission of the moon are consistent with"
## [5] "the very low thermal conductivity of the surface layer which was derived"
## [6] "from the variation in the infrared emission during eclipses (e&g&,"
## [7] "Garstung, 1958). When sufficiently accurate and complete measurements"
## [8] "are available, it will be possible to set limits on the thermal"
## [9] "and electrical characteristics of the surface and subsurface materials"
## [10] "of the moon. Observations of the radio emission of a planet"
tail(corpus.file, n=10) # the last 10 lines, now without the line annotation
## [1] "The sensing of this rotation by the ~<X> gyro can be utilized"
## [2] "to direct the platform into proper heading. In Fig& 7-4,"
## [3] "the input axis of the three-axis platform is shown at some point on the"
## [4] "earth. The point is at a latitude ~|l, and the platform is at"
## [5] "an error in heading east. The earth is spinning at an angular velocity"
## [6] "~\\q equal to one revolution per 24 hr&. When the platform is"
## [7] "level, ~|e is a rotation about the ~<Z> axis of the platform"
## [8] "**f. Since the earth is rotating and the unleveled gyro-stabilized platform"
## [9] "is fixed with respect to a reference in space, an observer on"
## [10] "the earth will see the platform rotating (with respect to the earth)."
## identify all characters used in the file
sort(                                 # print, in sorted order,
   unique(                            # the distinct elements of
      unlist(                         # the single vector made from
         strsplit(corpus.file, "")))) # corpus.file split up at every character
## [1] " " "_" "-" "," ";" ":" "!" "?" "." "'" "\"" "(" ")" "["
## [15] "]" "{" "}" "@" "*" "/" "\\" "&" "#" "%" "`" "^" "+" "<"
## [29] ">" "|" "~" "$" "0" "1" "2" "3" "4" "5" "6" "7" "8" "9"
## [43] "a" "A" "b" "B" "c" "C" "d" "D" "e" "E" "f" "F" "g" "G"
## [57] "h" "H" "i" "I" "j" "J" "k" "K" "l" "L" "m" "M" "n" "N"
## [71] "o" "O" "p" "P" "q" "Q" "r" "R" "s" "S" "t" "T" "u" "U"
## [85] "v" "V" "w" "W" "x" "X" "y" "Y" "z" "Z"
## extract all words from the file
## note: for any line that begins with a non-letter (e.g. "#1. INTRODUCTION# ..."),
## strsplit returns "" as the first piece; these empty strings are discarded here
## so that they are not counted as a word type in the frequency list
strsplit(corpus.file, # split up corpus.file
   "[^\\p{L}]+", # at every run of 1 or more non-letter characters
   perl=TRUE) %>% # using Perl-compatible regular expressions, then
   unlist %>% # make that a vector, then
   tolower %>% # convert to lower case, then
   grep("\\p{L}", ., # keep only the elements that contain at least one letter
      perl=TRUE, # using Perl-compatible regular expressions
      value=TRUE) -> # (return the elements themselves, not their positions), then
   corpus.words # put the result into corpus.words
## tabulate and sort by frequency
corpus.freqlist <-              # corpus.freqlist becomes
   sort(                        # the sorted version of
      table(corpus.words),      # the counts of each element of corpus.words,
      decreasing=TRUE)          # most frequent first
## inspect the top of the list
head(corpus.freqlist, n=30)     # the 30 most frequent words
## corpus.words
## the of and in to a is that for be as by
## 12540 7459 4288 4110 3958 3530 2410 1715 1568 1360 1291 1227
## with it was f are this on or which from not
## 1158 1149 1115 1009 993 993 957 907 835 795 793 787
## an at were have one s
## 739 739 633 551 550 539
tail(corpus.freqlist, n=30) # the last 30 entries of the list, i.e. 30 of the least frequent words
## corpus.words
## yakima yale yamabe yarns yd yearbook
## 1 1 1 1 1 1
## yearnings yeasts yeats yielding yinger yok
## 1 1 1 1 1 1
## yorker yorkers yorktown younger yours yourselves
## 1 1 1 1 1 1
## yr yucatan yujobo yuri yyyy zabel
## 1 1 1 1 1 1
## zeal zeros zest zinc zones zur
## 1 1 1 1 1 1
## plot a Zipfian curve
## log-log plot of word frequency against frequency rank; the axis labels default
## to the x= and y= expressions as written, so rewording them changes the labels
plot( # plot
x=log2(rank(-corpus.freqlist)), # on the x-axis the logged ranks of the words (negating the counts gives the most frequent word the lowest rank; tied words share an averaged rank)
y=log2(corpus.freqlist), # on the y-axis the logged frequencies of the words in the freq list
pch=16, col="#00000020") # filled circles in black at low opacity, so overlapping points appear darker
grid() # add a grid
# a frequency list of several tagged corpus files ###############################################
## define the locations of the corpus files
corpus.files <- list.files(     # corpus.files holds the listing of
   path="03_data/Brown_tagged", # the directory/folder with this name,
   full.names=TRUE)             # each entry prefixed with the directory path
## create a collector/container with one (still empty) slot per corpus file
words.per.file <- vector("list", length(corpus.files))
## load each corpus file, extract the words, and store them in that file's slot
for (counter in seq_along(corpus.files)) { # for each corpus file found in the directory
   current.corpus.file <- # make current.corpus.file the result of
      scan( # loading
         corpus.files[counter], # the corpus file number 'counter'
         what=character(), # which contains character strings
         sep="\n", # separated by line breaks
         quiet=TRUE) # suppress output
   current.corpus.file <- # make current.corpus.file the result of
      gsub( # replacing
         "^.*? ", # everything from the beginning of a line till the 1st space
         "", # by nothing
         current.corpus.file, # in current.corpus.file
         perl=TRUE) # using Perl-compatible regular expressions
   strsplit( # split up
      current.corpus.file, # current.corpus.file
      "_[^ ]+ ?", # at every _ followed by non-spaces till maybe a space
      perl=TRUE) %>% # using Perl-compatible regular expressions, then
      unlist %>% # make the resulting object a vector, then
      tolower -> # make that be all lowercase, then put this into
      current.corpus.words # an object called current.corpus.words
   words.per.file[[counter]] <- # put into this file's slot of the collector
      current.corpus.words # the words just extracted
}
## combine the words of all files into one vector; this is done once, after the
## loop, rather than re-copying an ever-growing vector on every iteration
all.corpus.words <- unlist(words.per.file)
all.corpus.words %>%      # take all.corpus.words, then
   grep("\\p{L}", .,      # keep the elements that contain at least one letter
      perl=TRUE,          # (using Perl-compatible regular expressions)
      value=TRUE) ->      # returning the elements themselves rather than their positions,
   all.corpus.words       # and store the result back in all.corpus.words
## tabulate and sort by frequency
corpus.freqlist <-               # overwrite corpus.freqlist with
   sort(                         # the sorted version of
      table(all.corpus.words),   # the counts of each element of all.corpus.words,
      decreasing=TRUE)           # most frequent first
## inspect the top of the list
head(corpus.freqlist, n=30)      # the 30 most frequent words
## all.corpus.words
## the of and to a in that is was he for it
## 70005 36423 28861 26178 23140 21344 10602 10109 9836 9565 9493 8763
## with as his on be at by i this had not are
## 7295 7252 6997 6744 6380 5383 5305 5159 5146 5140 4612 4394
## but from or have an they
## 4383 4373 4205 3946 3741 3624
tail(corpus.freqlist, n=30) # the last 30 entries of the list, i.e. 30 of the least frequent words
## all.corpus.words
## zend-avesta zennist zeroed
## 1 1 1
## zhitkov zhitzhakli zhok
## 1 1 1
## ziggy ziminska-sygietynska zingggg-o
## 1 1 1
## zinman zionism zionists
## 1 1 1
## zip zipped zipper
## 1 1 1
## ziraldo zlotys zoe
## 1 1 1
## zombie zombies zoned
## 1 1 1
## zooey zoology zoomed
## 1 1 1
## zooming zooms zoooop
## 1 1 1
## zorrillas zu zwei
## 1 1 1
## log-log plot of word frequency against frequency rank for the tagged files; the
## axis labels default to the x= and y= expressions as written
plot( # plot
x=log2(rank(-corpus.freqlist)), # on the x-axis the logged ranks of the words (negating the counts gives the most frequent word the lowest rank; tied words share an averaged rank)
y=log2(corpus.freqlist), # on the y-axis the logged frequencies of the words in the freq list
pch=16, col="#00000020") # filled circles in black at low opacity, so overlapping points appear darker
grid() # add a grid
# I will show you a frequency list of 'things in a slot' later.