% Journal article: Watanabe et al. (2020), Information and Software Technology, vol. 128.
% Fields `affiliation`, `targetfile` and `urlaccessdate` are non-standard and are
% ignored by standard bibliography styles; they are kept as repository metadata.
% NOTE(review): the third author repeats "Candido" in both the surname and the
%   given-name parts -- verify against the publisher record before changing.
% NOTE(review): `pages` holds an article number with an "e" prefix -- presumably a
%   repository convention; confirm whether the bare number is preferred.
@Article{WatanabeFeCaSoCaVi:2020:ReEfSo,
author = "Watanabe, Willian Massami and Felizardo, Katia Romero and Candido
J{\'u}nior, Arnaldo Candido and Souza, {\'E}rica Ferreira de and
Campos Neto, Jos{\'e} Ede de and Vijaykumar, Nandamudi
Lankalapalli",
affiliation = "{Universidade Tecnol{\'o}gica Federal do Paran{\'a} (UTFPR)} and
{Universidade Tecnol{\'o}gica Federal do Paran{\'a} (UTFPR)} and
{Universidade Tecnol{\'o}gica Federal do Paran{\'a} (UTFPR)} and
{Universidade Tecnol{\'o}gica Federal do Paran{\'a} (UTFPR)} and
{Universidade Tecnol{\'o}gica Federal do Paran{\'a} (UTFPR)} and
{Instituto Nacional de Pesquisas Espaciais (INPE)}",
title = "Reducing efforts of software engineering systematic literature
reviews updates using text classification",
journal = "Information and Software Technology",
year = "2020",
volume = "128",
pages = "e106395",
month = dec,
keywords = "Systematic literature review, SLR, Automatic selection, Review update,
Text classification, Document classification, Text categorization",
abstract = "Context: Systematic Literature Reviews (SLRs) are frequently used
to synthesize evidence in Software Engineering (SE), however
replicating and keeping SLRs up-to-date is a major challenge. The
activity of studies selection in SLR is labor intensive due to the
large number of studies that must be analyzed. Different
approaches have been investigated to support SLR processes, such
as: Visual Text Mining or Text Classification. But acquiring the
initial dataset is time-consuming and labor intensive. Objective:
In this work, we proposed and evaluated the use of Text
Classification to support the studies selection activity of new
evidences to update SLRs in SE. Method: We applied Text
Classification techniques to investigate how effective and how
much effort could be spared during the studies selection phase of
an SLR update. Considering the SLRs update scenario, the studies
analyzed in the primary SLR could be used as a classified dataset
to train Supervised Machine Learning algorithms. We conducted an
experiment with 8 Software Engineering SLRs. In the experiments,
we investigated the use of multiple preprocessing and feature
extraction tasks such as tokenization, stop words removal, word
lemmatization, TF-IDF (Term-Frequency/Inverse-Document-Frequency)
with Decision Tree and Support Vector Machines as classification
algorithms. Furthermore, we configured the classifier activation
threshold for maximizing Recall, hence reducing the number of
Missed selected studies. Results: The techniques accuracies were
measured and the results achieved on average a F-Score of 0.92 and
62% of exclusion rate when varying the activation threshold of the
classifiers, with a 4% average number of Missed selected studies.
Both the Exclusion rate and number of Missed selected studies were
significantly different when compared to classifier which did not
use the configuration of the activation threshold. Conclusion: The
results showed the potential of the techniques in reducing the
effort required of SLRs updates.",
doi = "10.1016/j.infsof.2020.106395",
url = "https://doi.org/10.1016/j.infsof.2020.106395",
issn = "0950-5849",
language = "en",
targetfile = "watanabe_reducing.pdf",
urlaccessdate = "25 abr. 2024"
}