In this task, we will analyze text data scraped from the Tidy Text Mining website (https://www.tidytextmining.com), which serves as our dataset. First, we will count how many times each word appears across the scraped pages and create a word cloud showing the most frequently used terms. We will then look at pairs of adjacent words (bigrams) to surface common phrases, and perform sentiment analysis to see whether the text leans positive or negative.


Load necessary libraries

# Load required packages for web scraping, text mining, and visualization
library(rvest)
library(stringr)
library(tidytext)
library(tm)
library(wordcloud)
library(ggplot2)
library(tidyr)
library(dplyr)
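
If any of these packages are missing locally, they can be installed from CRAN first:

# One-time setup: install the packages used below
install.packages(c("rvest", "stringr", "tidytext", "tm", "wordcloud",
                   "ggplot2", "tidyr", "dplyr"))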


Web Scraping


Specify the base URL and build the list of page URLs to scrape

base_url <- "https://www.tidytextmining.com"

# Chapter slugs taken from the site's table of contents; "" is the home page
page_paths <- c("", "preface", "tidytext", "sentiment", "tfidf", "ngrams",
                "dtm", "topicmodeling", "twitter", "nasa", "usenet", "references")
page_urls <- paste0(base_url, "/", page_paths)


Scrape content from each page

all_page_content <- list()

for (full_url in page_urls) {
  cat("Scraping content from:", full_url, "\n")
  
  # Download the page and extract the main content container
  webpage <- read_html(full_url)
  content_section <- html_nodes(webpage, css = "#content")
  content_text <- html_text(content_section)
  
  # html_text() already drops tags; this is a defensive sweep for any stray
  # script/style fragments, followed by whitespace normalization
  cleaned_content <- str_replace_all(content_text, "<script.*?</script>|<style.*?</style>", "")
  final_content <- str_squish(cleaned_content)
  
  all_page_content[[full_url]] <- final_content
}
## Scraping content from: https://www.tidytextmining.com/ 
## Scraping content from: https://www.tidytextmining.com/preface 
## Scraping content from: https://www.tidytextmining.com/tidytext 
## Scraping content from: https://www.tidytextmining.com/sentiment 
## Scraping content from: https://www.tidytextmining.com/tfidf 
## Scraping content from: https://www.tidytextmining.com/ngrams 
## Scraping content from: https://www.tidytextmining.com/dtm 
## Scraping content from: https://www.tidytextmining.com/topicmodeling 
## Scraping content from: https://www.tidytextmining.com/twitter 
## Scraping content from: https://www.tidytextmining.com/nasa 
## Scraping content from: https://www.tidytextmining.com/usenet 
## Scraping content from: https://www.tidytextmining.com/references


Combine all scraped content into a single data frame and count the total words

all_content_df <- data.frame(page_url = names(all_page_content), content = unlist(all_page_content))

# Approximate the word count by counting whitespace characters; after
# str_squish(), spaces = words - 1 on each page, so this slightly undercounts
total_word_count <- sum(nchar(all_content_df$content) - nchar(gsub("\\s+", "", all_content_df$content)))

cat("Total word count:", total_word_count, "\n")
## Total word count: 39010
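
As a sanity check, the same quantity can be computed more directly by counting runs of non-whitespace characters with stringr (already loaded); the variable name here is just for illustration:

# Cross-check: count whitespace-delimited tokens directly
str_count_total <- sum(str_count(all_content_df$content, "\\S+"))
cat("Word count via str_count:", str_count_total, "\n")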


Data Cleaning


Clean the corpus and create a term-document matrix

# Build a tm corpus and normalize it: lowercase, then strip punctuation,
# numbers, and English stop words
corpus <- Corpus(VectorSource(all_content_df$content))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))

tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)

# Sort terms by total frequency and keep only purely alphabetic ones
word_freq <- sort(rowSums(m), decreasing = TRUE)
valid_words <- grep("^[a-zA-Z]+$", names(word_freq), value = TRUE)
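
Before plotting, it is worth eyeballing the top of the frequency table; the exact terms will depend on the scrape:

# Peek at the ten most frequent purely alphabetic terms
head(word_freq[valid_words], 10)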


Visualization


Generate word cloud based on filtered word frequencies

wordcloud(words = valid_words, freq = word_freq[valid_words], min.freq = 10, 
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))
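
One caveat: wordcloud() places terms with some randomness even when random.order = FALSE, so the layout can differ between knits. Setting the seed first makes the figure reproducible (the seed value 2024 is arbitrary):

# Fix the RNG seed so the cloud layout is identical on every knit
set.seed(2024)
wordcloud(words = valid_words, freq = word_freq[valid_words], min.freq = 10, 
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))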


Create a bar plot for the top N words by frequency

top_n <- 10
top_words <- head(valid_words, n = top_n)
word_counts <- word_freq[top_words]

bar_data <- data.frame(word = top_words, frequency = word_counts)
bar_data <- bar_data[order(-bar_data$frequency), ]

ggplot(bar_data, aes(x = reorder(word, frequency), y = frequency)) +
  geom_col() +
  labs(title = paste("Top", top_n, "Words by Frequency"),
       x = "Word", y = "Frequency") +
  theme_minimal() +
  coord_flip()


Analysis


Perform sentiment analysis and visualize results

corpus_tokens <- all_content_df %>%
  unnest_tokens(word, content) %>%   # unnest_tokens() lowercases by default
  anti_join(stop_words, by = "word")
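
As a cross-check on the tm pipeline above, the same term frequencies can be reproduced entirely within the tidy framework from these tokens (tidy_word_freq is an illustrative name):

# Tidy equivalent of the tm term-frequency table
tidy_word_freq <- corpus_tokens %>%
  count(word, sort = TRUE)
head(tidy_word_freq, 10)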

sentiment_scores <- corpus_tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment_score = positive - negative)
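
spread() still works but is superseded in current tidyr; with tidyr 1.0 or later the same reshape can be written with pivot_wider() (sentiment_scores_pw is just an illustrative name):

# Equivalent reshape with the newer tidyr verb
sentiment_scores_pw <- corpus_tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)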

top_sentiment_words <- sentiment_scores %>%
  slice_max(order_by = abs(sentiment_score), n = 10) %>%
  mutate(word = reorder(word, sentiment_score))

ggplot(top_sentiment_words, aes(x = word, y = sentiment_score, fill = sentiment_score > 0)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("red", "green")) +
  labs(title = "Sentiment Analysis (Top 10 Words)",
       x = "Word", y = "Sentiment Score") +
  theme_minimal() +
  coord_flip()
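
Beyond individual words, a quick corpus-level tally shows whether positive or negative vocabulary dominates overall; the exact counts will depend on the scrape:

# Aggregate polarity across all tokens matched by the Bing lexicon
corpus_tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment)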


Extract bigrams from the corpus and visualize top bigrams

bigrams <- corpus_tokens %>%
  group_by(page_url) %>%                 # don't pair words across page boundaries
  mutate(next_word = lead(word)) %>%
  ungroup() %>%
  filter(!is.na(next_word)) %>%          # drop the dangling last token of each page
  mutate(bigram = paste(word, next_word, sep = " ")) %>%
  count(bigram, sort = TRUE)

# Keep the ten most frequent bigrams (ties broken by order of appearance)
top_bigrams <- bigrams %>%
  slice_max(order_by = n, n = 10, with_ties = FALSE)

ggplot(top_bigrams, aes(x = reorder(bigram, n), y = n)) +
  geom_col() +
  labs(title = "Top Bigrams",
       x = "Bigram", y = "Frequency") +
  theme_minimal() +
  coord_flip()
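
A caveat on the approach above: because stop words were removed before pairing, the two halves of a "bigram" were not necessarily adjacent in the original text. The Tidy Text Mining book tokenizes bigrams directly with unnest_tokens() and filters stop words afterwards; a sketch of that variant (the name true_bigrams is just for illustration):

# Tokenize true adjacent-word bigrams straight from the raw page text
true_bigrams <- all_content_df %>%
  unnest_tokens(bigram, content, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)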