In-class_Ex05

pacman::p_load(tidyverse, readtext,
               quanteda, tidytext)
Put all the data into one tabular data frame.

data_folder <- "data/MC1/articles"
Text sensing to extract the text:

text_data <- readtext(paste0("data/MC1/articles",
                             "/*"))

or, equivalently:

text_data <- readtext("data/MC1/articles")
Basic tokenisation
usenet_words <- text_data %>%
  unnest_tokens(word, text) %>%          # tokenise the text column into words
  filter(str_detect(word, "[a-z']$"),    # keep word-like tokens
         !word %in% stop_words$word)     # remove stop words

usenet_words %>%
  count(word, sort = TRUE)
readtext object consisting of 3261 documents and 0 docvars.
# A data frame: 3,261 × 3
  word             n text
  <chr>        <int> <chr>
1 fishing       2177 "\"\"..."
2 sustainable   1525 "\"\"..."
3 company       1036 "\"\"..."
4 practices      838 "\"\"..."
5 industry       715 "\"\"..."
6 transactions   696 "\"\"..."
# ℹ 3,255 more rows
Observations: the most common words are fishing, sustainable, and company.
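To see these counts visually, a minimal ggplot2 sketch (ggplot2 is loaded with tidyverse; the cut-off of ten words is an arbitrary choice, not part of the exercise):

usenet_words %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 10) %>%                    # keep the ten most frequent words
  ggplot(aes(x = n, y = reorder(word, n))) +  # order bars by frequency
  geom_col() +
  labs(x = "Count", y = NULL)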
Creating a table to observe the word counts:

temp_table <- usenet_words %>%
  count(word, sort = TRUE)
corpus_text <- corpus(text_data)
summary(corpus_text, 5)
Corpus consisting of 338 documents, showing 5 documents:
Text Types Tokens Sentences
Alvarez PLC__0__0__Haacklee Herald.txt 206 433 18
Alvarez PLC__0__0__Lomark Daily.txt 102 170 12
Alvarez PLC__0__0__The News Buoy.txt 90 200 9
Alvarez PLC__0__1__Haacklee Herald.txt 96 187 8
Alvarez PLC__0__1__Lomark Daily.txt 241 504 21
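Since quanteda is loaded, the corpus can also be tokenised and turned into a document-feature matrix for a frequency view. A minimal sketch using the corpus_text object built above; the cleaning options and the cut-off of ten features are assumptions, not part of the exercise:

corpus_tokens <- tokens(corpus_text,
                        remove_punct = TRUE,
                        remove_numbers = TRUE)   # basic cleaning
corpus_dfm <- dfm(corpus_tokens) %>%
  dfm_remove(stopwords("english"))               # drop English stop words
topfeatures(corpus_dfm, 10)                      # ten most frequent features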
Next, separate doc_id into two columns, X and Y, splitting on the "__0__" delimiter. Some file names contain "1" instead of "0", so the split does not occur for those rows; too_few = "align_end" handles them instead of raising an error.
text_data_splitted <- text_data %>%
  separate_wider_delim("doc_id",
                       delim = "__0__",
                       names = c("X", "Y"),
                       too_few = "align_end")
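If the split should work whether the numeric tokens are "0" or "1", a regex-based split is one option. A sketch using tidyr's separate_wider_regex(); the "__\\d+__\\d+__" pattern is an assumption based on the file names shown in the corpus summary above:

text_data_split2 <- text_data %>%
  separate_wider_regex("doc_id",
                       patterns = c(X = ".*?",          # leading company name
                                    "__\\d+__\\d+__",   # numeric tokens, matched but dropped
                                    Y = ".*"))          # trailing newspaper file name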
pacman::p_load(jsonlite, tidyverse)

## pacman::p_load(jsonlite, tidygraph,
##                ggraph, tidyverse, readtext,
##                quanteda, tidytext)
In the code chunk below, fromJSON() of the jsonlite package is used to import mc1.json into the R environment.

mc1_data <- fromJSON("data/MC1/mc1.json")
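A quick inspection of what was imported. Treating the file as a node-link graph with nodes and links elements is an assumption about mc1.json's structure, to be verified with names() first:

names(mc1_data)                        # inspect the top-level elements

# Assumption: mc1.json stores a node-link graph with `nodes` and `links`.
mc1_nodes <- as_tibble(mc1_data$nodes)
mc1_edges <- as_tibble(mc1_data$links)
glimpse(mc1_nodes)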