In this project, we analyze a dataset of job postings in order to discover useful information about a specific job title through various text analytics techniques. The job title and location used to gather the job posting descriptions were Data Scientist and Los Angeles, respectively, and all postings were filtered by date: January 2019. The techniques used include tokenization, stemming, lemmatization, and n-gram analysis.
The main purpose of this project is to gain insights that help job applicants become familiar with what organizations are looking for in Data Scientist positions in the Los Angeles area.
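As a quick illustration of these techniques, the toy example below (an invented sentence, not taken from the dataset) shows tokenization, stemming, lemmatization, and bigram construction using packages that are loaded in the next step.
# Toy example (not from the dataset): tokenization, stemming, lemmatization, bigrams
library(SnowballC)   # stemming
library(textstem)    # lemmatization
txt <- "Building predictive models and analyzing data"
tokens <- unlist(strsplit(tolower(txt), "\\s+"))   # tokenization by whitespace
wordStem(tokens, language = "english")             # stemming, e.g. "build", "model"
lemmatize_words(tokens)                            # lemmatization, e.g. "build", "analyze"
paste(head(tokens, -1), tail(tokens, -1))          # bigrams (n-grams with n = 2)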
Loading the libraries used throughout this project’s life cycle: data cleaning, processing & visualization…
# There are a range of packages in R for doing text analysis. These include:
#
# hunspell - Tools for spelling, stemming, and tokenization.
# SnowballC - Tools for stemming in a range of languages.
# stringr - Tools for cleaning individual strings (e.g. trimming whitespace).
# text2vec - Tools for text vectorization, topic modeling, and more.
# tidytext - Tools for word processing, sentiment analysis, and tidying text.
# tm - A framework for text mining.
Sys.setenv(JAVA_HOME='C:\\Program Files (x86)\\Java\\jre1.8.0_45') # for 32-bit version
library(tm)
library(tmap)
library(SnowballC)
library(ggplot2)
library(DT)
library(plyr)
library(tibble)
library(RColorBrewer)
library(wordcloud)
library(stringr)
library(textstem)
library(RWeka)
library(qdap)
library(udpipe)
library(lattice)
library(filematrix)
Loading Job posting dataset…
# Setting my working directory in R
setwd("C:/Saul/Portfolio/Text Analytics")
# Reading the dataset
data.text <- read.csv("DataScientistjobs.csv")
data.tools <- read.csv("Tools.csv")
#yy <- matrix(t(as.matrix(as.data.frame(t(data.tools)))), nrow = 1)
# Data structure review
# str(data.text)
# Data preview
Checking a sample of the data…
datatable(data.text[(1:20),], filter = 'top', options = list(
pageLength = 5, scrollX = TRUE, scrollY = "300px", autoWidth = TRUE))
The next step is to transform the dataset into a corpus using the Corpus function from the tm package. Corpus takes a source of documents (here, the description column wrapped in VectorSource) and returns a corpus object that the other tm functions can operate on. Read more about tm.
# Create corpus
corpus = Corpus(VectorSource(data.text$Description))
# Look at corpus
print(corpus)
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 43
Once we have a corpus, we need to clean the documents in it, e.g., punctuation removal, stemming, stopword removal, etcetera.
cleanCorpus <- function(corpus){
# Converting to lower-case
corpus.cleaned <- tm_map(corpus, tolower)
# Removing punctuation
corpus.cleaned <- tm_map(corpus.cleaned, removePunctuation)
# Looking at stop words
v_stopwords <- c(stopwords("en"),"will","etc","build","using","usemploymentcompliancecgicom","unless","vary","reason","routed","recruited")
corpus.cleaned <- tm_map(corpus.cleaned, removeWords,v_stopwords )
#Eliminating extra whitespace
corpus.cleaned <- tm_map(corpus.cleaned, stripWhitespace)
#Removing numbers
corpus.cleaned <- tm_map(corpus.cleaned, removeNumbers)
#Stemming/Lemmatizing document
#corpus.cleaned <- tm_map(corpus.cleaned, lemmatize_strings,v_stopwords)
#Removing special characters
corpus.cleaned <- tm_map(corpus.cleaned, str_replace_all,"[^[:alnum:]]", " ")
#corpus[[1]]$content
return(corpus.cleaned)
}
corpus <- cleanCorpus(corpus)
Now let’s create a Term Document Matrix (TDM). A term-document matrix is a mathematical matrix that describes the frequency of terms that occur in a collection of documents. This video concisely explains what TDM is.
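For intuition, the toy example below (two invented one-line documents, not the job data) shows the structure: rows are terms, columns are documents, and each cell is the number of times the term occurs in that document.
# Toy TDM (not the job data): rows = terms, columns = documents, cells = counts
toy.corpus <- Corpus(VectorSource(c("data science jobs", "data analysis")))
inspect(TermDocumentMatrix(toy.corpus))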
Also, let’s inspect the first 10 terms across the first 10 documents.
#convert to document term matrix
corpustdm <- TermDocumentMatrix(corpus)
#print(corpustdm)
#print(dimnames(corpustdm)$Terms)
#dim(corpustdm)
inspect(corpustdm[1:10,1:10])
## <<TermDocumentMatrix (terms: 10, documents: 10)>>
## Non-/sparse entries: 24/76
## Sparsity : 76%
## Maximal term length: 10
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1 10 2 3 4 5 6 7 8 9
## able 4 0 0 0 0 0 1 0 0 0
## activities 1 0 0 1 0 0 0 0 2 0
## alerts 1 0 0 0 0 0 0 0 0 0
## allow 1 0 0 0 0 0 0 0 0 0
## also 1 0 1 1 0 0 3 1 0 0
## analysis 5 3 0 2 2 0 1 0 1 3
## anova 1 0 0 0 0 0 0 0 0 0
## app 3 0 0 0 0 0 0 0 0 0
## apple 1 0 0 0 0 0 0 0 0 0
## applying 1 0 0 0 0 0 0 1 0 0
Checking the number of terms.
#collapse the matrix over documents to get each term's total frequency
set.seed(43)
colS <- colSums(as.matrix(t(corpustdm)))
#total number of terms
length(colS)
## [1] 2895
# create sort order (desc)
ord <- order(colS,decreasing=TRUE)
Inspecting the most frequently occurring terms.
colS[head(ord)]
## data experience learning business science machine
## 491 233 144 143 121 114
Inspecting the least frequently occurring terms.
colS[tail(ord)]
## perceived proceeding recruiter reference returned
## 1 1 1 1 1
## transitioning
## 1
Listing terms that appear at least 50 times.
findFreqTerms(corpustdm, lowfreq=50)
## [1] "analysis" "business" "data" "experience" "learning"
## [6] "machine" "models" "python" "science" "statistical"
## [11] "team" "work" "years" "analytics" "degree"
## [16] "scientist" "solutions" "develop" "engineering" "skills"
## [21] "ability" "working"
Let’s take some of the terms above and check which other terms they correlate with, keeping only correlation coefficients of at least 0.7.
findAssocs(corpustdm, "analysis",0.7)
## $analysis
## presentations
## 0.78
findAssocs(corpustdm, "degree",0.7)
## $degree
## serve economic computer gender
## 0.76 0.75 0.73 0.70
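The scores returned by findAssocs are Pearson correlations between the per-document frequency vectors of two terms. A quick manual check of the “analysis”/“presentations” pair reported above:
# Manual check: correlate the two terms' counts across documents
tdm.m <- as.matrix(corpustdm)
cor(tdm.m["analysis", ], tdm.m["presentations", ])  # should be close to the 0.78 above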
m <- as.matrix(corpustdm)
corpusdf <- data.frame(m)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
corpusdf <- rownames_to_column(corpusdf)
corpusdf <- rename(corpusdf, c("rowname"="word"))
corpusdf$wordcount <- rowSums(corpusdf[-1])
corpusdf <- corpusdf[order(-corpusdf$wordcount),]
#plotting histogram
ggplot(subset(corpusdf, wordcount>43), aes(reorder(word, -wordcount), wordcount)) +
geom_bar(stat = "identity", fill = "darkgreen", alpha = 0.7) +
xlab("Words with the highest frequency") +
ylab("Frequency") +
ggtitle("Understanding word frequencies") +
theme_bw() +
theme(axis.text.x=element_text(angle=60, hjust=0.9), plot.title = element_text(hjust=0.5)) +
scale_fill_brewer()
pal <- colorRampPalette(colors = c("darkgreen", "lightgreen"))(10)
barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
main ="Top 10 Most Frequent Words",
ylab = "Frequency", col = pal, border = NA)
# wordcloud(names(colS), colS, colors = brewer.pal(6, 'Dark2'),random.order=FALSE, rot.per= 0.35, max.words = 100)
#word totals were computed above by summing each term's counts across all documents
#and sorted in decreasing order; now plot them as a word cloud
pal <- brewer.pal(9,"RdYlGn")
pal <- pal[-(1:2)]
set.seed(142)
wordcloud(words = corpusdf$word, freq = corpusdf$wordcount, colors = brewer.pal(6, "Dark2"), random.order = FALSE, rot.per = 0.35, max.words = 150)
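Next, every job description is annotated with Universal Part-of-Speech (UPOS) tags using the udpipe English model, so we can see which parts of speech, adjectives, and verbs appear most often in the postings.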
#ud_model <- udpipe_download_model(language = "english")
#ud_model <- udpipe_load_model(ud_model$file_model)
ud_model <- udpipe_load_model(file = "english-ewt-ud-2.3-181115.udpipe")
x <- udpipe_annotate(ud_model, x = data.text$Description, doc_id = data.text$Company)
x <- as.data.frame(x)
stats <- txt_freq(x$upos)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = stats, col = "lightgreen",
main = "UPOS (Universal Parts of Speech)\n frequency of occurrence",
xlab = "Freq")
## ADJECTIVES
stats <- subset(x, upos %in% c("ADJ"))
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "purple",
main = "Most occurring adjectives", xlab = "Freq")
## VERBS
stats <- subset(x, upos %in% c("VERB"))
stats <- txt_freq(stats$token)
stats$key <- factor(stats$key, levels = rev(stats$key))
barchart(key ~ freq, data = head(stats, 20), col = "gold",
main = "Most occurring Verbs", xlab = "Freq")
Rapid Automatic Keyword Extraction (RAKE) is an algorithm to automatically extract keywords from documents. More info here.
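At its core, RAKE extracts candidate keywords (runs of content words), scores each member word by degree(word)/frequency(word), and sums those word scores per candidate. The simplified sketch below illustrates the scoring on toy candidates; keywords_rake from udpipe, used right after, does this properly on the annotated corpus.
# Simplified RAKE scoring on toy candidate keywords (illustration only)
candidates <- list(c("machine", "learning"),
                   c("machine", "learning", "models"),
                   c("data"))
all_words <- unlist(candidates)
w_freq <- table(all_words)                           # how often each word appears overall
deg <- sapply(names(w_freq), function(w)             # degree: total length of candidates containing the word
  sum(lengths(candidates)[sapply(candidates, function(p) w %in% p)]))
score_word <- deg / as.numeric(w_freq)               # word score = degree / frequency
sapply(candidates, function(p) sum(score_word[p]))   # candidate score = sum of its word scores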
## Using RAKE
#Rapid Automatic Keyword Extraction (RAKE) is an algorithm to automatically extract keywords from documents.
#More info on https://www.thinkinfi.com/2018/09/keyword-extraction-using-rake-in-python.html
stats <- keywords_rake(x = x, term = "lemma", group = "doc_id",
relevant = x$upos %in% c("NOUN", "ADJ"))
stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
barchart(key ~ rake, data = head(subset(stats, freq > 3), 20), col = "red",
main = "Keywords identified by RAKE",
xlab = "Rake")
# ## Using a sequence of POS tags (noun phrases / verb phrases)
# x$phrase_tag <- as_phrasemachine(x$upos, type = "upos")
# stats <- keywords_phrases(x = x$phrase_tag, term = tolower(x$token),
# pattern = "(A|N)*N(P+D*(A|N)*N)*",
# is_regex = TRUE, detailed = FALSE)
# stats <- subset(stats, ngram > 1 & freq > 3)
# stats$key <- factor(stats$keyword, levels = rev(stats$keyword))
# barchart(key ~ freq, data = head(stats, 20), col = "magenta",
# main = "Keywords - simple noun phrases", xlab = "Frequency")
#
library(dplyr)
library(tidytext)
# Define bigram & trigram tokenizers
tokenizer_bi <- function(x){
NGramTokenizer(x, Weka_control(min=2, max=2))
}
tokenizer_tri <- function(x){
NGramTokenizer(x, Weka_control(min=3, max=3))
}
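As a quick sanity check of the bigram tokenizer defined above (a toy string; NGramTokenizer comes from the Java-backed RWeka package loaded earlier):
# Toy check: the bigram tokenizer returns overlapping two-word sequences
tokenizer_bi("machine learning models in production")
## e.g. "machine learning" "learning models" "models in" "in production"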
# Text transformations
cleanVCorpus <- function(corpus){
corpus.tmp <- tm_map(corpus, removePunctuation)
corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
v_stopwords <- c(stopwords("en"),"will","etc","build","using")
corpus.tmp <- tm_map(corpus.tmp, removeWords, v_stopwords)
corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
return(corpus.tmp)
}
# Most frequent bigrams
frequentBigrams <- function(text){
s.cor <- VCorpus(VectorSource(text))
s.cor.cl <- cleanVCorpus(s.cor)
s.tdm <- TermDocumentMatrix(s.cor.cl, control=list(tokenize=tokenizer_bi))
s.tdm <- removeSparseTerms(s.tdm, 0.999)
m <- as.matrix(s.tdm)
word_freqs <- sort(rowSums(m), decreasing=TRUE)
dm <- data.frame(word=names(word_freqs), freq=word_freqs)
return(dm)
}
# Most frequent bigrams
ep4.bigrams <- frequentBigrams(data.text$Description)[1:20,]
ggplot(data=ep4.bigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="chocolate2", colour="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=60, hjust=1)) +
labs(x="Bigram", y="Frequency")
# Most frequent trigrams
frequentTrigrams <- function(text){
s.cor <- VCorpus(VectorSource(text))
s.cor.cl <- cleanVCorpus(s.cor)
s.tdm <- TermDocumentMatrix(s.cor.cl, control=list(tokenize=tokenizer_tri))
s.tdm <- removeSparseTerms(s.tdm, 0.999)
m <- as.matrix(s.tdm)
word_freqs <- sort(rowSums(m), decreasing=TRUE)
dm <- data.frame(word=names(word_freqs), freq=word_freqs)
return(dm)
}
# Most frequent trigrams
ep4.Trigrams <- frequentTrigrams(data.text$Description)[1:20,]
ggplot(data=ep4.Trigrams, aes(x=reorder(word, -freq), y=freq)) +
geom_bar(stat="identity", fill="midnightblue", colour="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=60, hjust=1)) +
labs(x="Trigram", y="Frequency")
uniqueWords = function(text) {
text <- strsplit(text, " |,|/|;")
text <- lapply(text,unique)
text <- sapply(text, function(u) paste0(u, collapse = " "))
return(text)
}
corpus = tm_map(corpus, content_transformer(uniqueWords))
corpustdm_all <- as.matrix(TermDocumentMatrix(corpus,control=list(wordLengths=c(1,Inf))))
freq <- rowSums(as.matrix(corpustdm_all))
#freq[ as.character(tolower(data.tools[,])) ]
m <- as.matrix(freq[ as.character(tolower(data.tools[,])) ])
n <- as.data.frame(rownames(m))
colnames(n) <- c("toolname")
n$frequency <- round(as.numeric(as.character(m[,1]))/ncol(corpustdm_all),2)
n <- na.omit(n)
#n <- subset(n,frequency>0.1)
n <- head(n[order(n$frequency, decreasing= T),], n = 20)
mycolors = colorRampPalette(brewer.pal(name="Blues", n = 9))(20) # "Blues" has at most 9 base colors
#mycolors = c(brewer.pal(name="Dark2", n = 8), brewer.pal(name="Paired", n = 7))
ggplot(data=n, aes(x = reorder(toolname, -frequency), y = frequency, fill = toolname)) +
geom_bar(stat="identity") +
scale_y_continuous(labels = scales::percent) +
geom_text(aes(label = paste0(frequency*100,"%")),
position = position_stack(vjust = 0.5), size = 3) +
scale_fill_manual(values = mycolors) +
theme(legend.position="none") +
labs(x = "Tools", y = "Percent") +
ggtitle("Top 20 Tools in Data Scientist Job Listings") +
theme(plot.title = element_text(hjust=0.5)) +
ggpubr::rotate_x_text()
Although it is well known that knowledge of coding and visualization tools is fundamental to a data scientist's work, the results show that Big Data tools are gaining ground in virtually every data scientist position. Big Data is therefore no longer associated exclusively with Data Engineer roles.
For further reading, please review the post from javinpaul.