<!-- exercise_questions.html -->

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />


<meta name="author" content="Benedict Anderer, Farhan Shaikh, and Saurav Jha" />

<meta name="date" content="2024-10-25" />

<title>String processing in R</title>

<script src="exercise_questions_files/header-attrs-2.20/header-attrs.js"></script>
<script src="exercise_questions_files/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="exercise_questions_files/bootstrap-3.3.5/css/lumen.min.css" rel="stylesheet" />
<script src="exercise_questions_files/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="exercise_questions_files/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="exercise_questions_files/bootstrap-3.3.5/shim/respond.min.js"></script>
<style>h1 {font-size: 34px;}
       /* Fixed pixel font sizes for the remaining heading levels; the
          document title (h1.title, 38px) is larger than a plain h1 (34px). */
       h1.title {font-size: 38px;}
       h2 {font-size: 30px;}
       h3 {font-size: 24px;}
       h4 {font-size: 18px;}
       h5 {font-size: 16px;}
       h6 {font-size: 12px;}
       /* Code keeps the surrounding text color and gets a faint
          (4% opacity) black background. */
       code {color: inherit; background-color: rgba(0, 0, 0, 0.04);}
       /* pre elements that carry no class attribute get a white background. */
       pre:not([class]) { background-color: white }</style>
<script src="exercise_questions_files/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="exercise_questions_files/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="exercise_questions_files/tocify-1.9.1/jquery.tocify.js"></script>
<script src="exercise_questions_files/navigation-1.1/tabsets.js"></script>

<style type="text/css">
  /* Code preserves its whitespace but is still allowed to wrap. */
  code{white-space: pre-wrap;}
  /* Utility classes for styled spans. */
  span.smallcaps{font-variant: small-caps;}
  span.underline{text-decoration: underline;}
  /* Each div.column is laid out inline, top-aligned, at half the container width. */
  div.column{display: inline-block; vertical-align: top; width: 50%;}
  /* Hanging indent: the block is pushed in by 1.5em and its first line is
     pulled back by the same amount. */
  div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
  /* Task lists are rendered without list markers. */
  ul.task-list{list-style: none;}
    </style>


<link rel="stylesheet" href="custom.css" type="text/css" />


<style type = "text/css">
/* Center the main container horizontally and cap its width at 940px.
   Note: a later rule in this file (div.main-container) has higher
   specificity and sets max-width to 1200px for div containers. */
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
/* Images never grow wider than their containing block. */
img {
  max-width:100%;
}
/* Space between a tab strip and the content of its pane. */
.tabbed-pane {
  padding-top: 12px;
}
/* Vertical gap below embedded HTML widgets. */
.html-widget {
  margin-bottom: 20px;
}
/* NOTE(review): this removes the focus outline from code-folding buttons
   without providing a replacement focus indicator; confirm this is
   acceptable for keyboard users. */
button.code-folding-btn:focus {
  outline: none;
}
/* Force summary elements to render as list items (presumably so the
   disclosure marker is shown under the theme stylesheet; TODO confirm). */
summary {
  display: list-item;
}
/* A paragraph that is the only child of a summary is displayed inline
   instead of as a block. */
details > summary > p:only-child {
  display: inline;
}
/* Code nested inside pre gets no padding of its own. */
pre code {
  padding: 0;
}
</style>


<!-- tabsets -->

<style type="text/css">
/* Dropdown-style tabsets: the tab strip is drawn as a bordered, rounded box
   that scrolls vertically once it exceeds 500px in height. */
.tabset-dropdown > .nav-tabs {
  display: inline-table;
  max-height: 500px;
  min-height: 44px;
  overflow-y: auto;
  border: 1px solid #ddd;
  border-radius: 4px;
}

/* Icon glyph (from the 'Glyphicons Halflings' font) placed before the active
   tab, and before the whole strip when it carries the nav-tabs-open class;
   separated from the label by a right border. */
.tabset-dropdown > .nav-tabs > li.active:before, .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
  content: "\e259";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

/* While the strip is open, the active tab shows a different glyph and
   drops the separator border. */
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
  content: "\e258";
  font-family: 'Glyphicons Halflings';
  border: none;
}

/* The active tab is always visible. This selector is more specific than the
   generic "li" rule at the end of this group, so it wins even though it
   appears earlier. */
.tabset-dropdown > .nav-tabs > li.active {
  display: block;
}

/* Tab links lose their border and background in every state (normal, focus, hover). */
.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
  border: none;
  display: inline-block;
  border-radius: 4px;
  background-color: transparent;
}

/* When the strip has the nav-tabs-open class (toggled by the click handler
   defined in a script later in this file), every tab is shown, stacked
   vertically. */
.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
  display: block;
  float: none;
}

/* Default state: tabs are hidden; the more specific rules above reveal the
   active tab, or all tabs when the strip is open. */
.tabset-dropdown > .nav-tabs > li {
  display: none;
}
</style>

<!-- code folding -->


<style type="text/css">

/* Outer margins of the table-of-contents box (top, right, bottom, left). */
#TOC {
  margin: 25px 0px 20px 0px;
}
/* On viewports up to 768px wide the TOC is positioned relatively and spans
   the full width. */
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}

/* Print layout: float the main content to the right. */
@media print {
.toc-content {
  /* see https://github.com/w3c/csswg-drafts/issues/4434 */
  float: right;
}
}

/* Horizontal padding of the main content column. */
.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

/* Widen the main container to 1200px. The "div." prefix makes this selector
   more specific than the plain .main-container rule (940px) earlier in the
   file, so this value applies to div containers. */
div.main-container {
  max-width: 1200px;
}

/* Default size limits of the tocify sidebar. */
div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

/* Viewports from 768px to 991px wide: give the sidebar a larger share of the width. */
@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

/* Viewports up to 767px wide: the sidebar spans the full width with no cap. */
@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

/* Uniform line height for TOC lists and entries. */
.tocify ul, .tocify li {
  line-height: 20px;
}

/* Entries nested under a sub-header use slightly smaller text. */
.tocify-subheader .tocify-item {
  font-size: 0.90em;
}

/* Square corners on TOC list-group items. */
.tocify .list-group-item {
  border-radius: 0px;
}


</style>


</head>

<body>


<div class="container-fluid main-container">


<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">


<div id="header">


<h1 class="title toc-ignore">String processing in R</h1>
<h3 class="subtitle">The package <strong>stringi</strong></h3>
<h4 class="author">Benedict Anderer, Farhan Shaikh, and Saurav Jha</h4>
<h4 class="date">2024-10-25</h4>

</div>


<div id="welcome-to-our-workshop" class="section level1">
<h1>🎉 WELCOME TO OUR WORKSHOP ! 🎉</h1>
<div id="workshop-overview" class="section level3">
<h3>Workshop Overview 📝</h3>
<p><strong>Exercise 1</strong>: Dive into <strong>stringi</strong> for
hands-on experience in string data cleaning and manipulation! In this
exercise, we will explore several core functions of
<strong>stringi</strong> that simplify text processing and extraction.
Specifically, we will focus on the following features:</p>
<ul>
<li><strong>Normalization</strong> ✍️: Standardizing text for
consistency.</li>
<li><strong>Sorting</strong> 📊: Organizing strings in a specified
order.</li>
<li><strong>Joining</strong> 🔗: Concatenating multiple strings
seamlessly.</li>
<li><strong>Formatting</strong> 🎨: Adjusting string representation to
meet specific needs.</li>
<li><strong>Counting</strong> 🔢: Determining the frequency of elements
within strings.</li>
<li><strong>Pattern Searching</strong> 🔍: Finding specific sequences
within the text.</li>
<li><strong>Locating and Extracting Patterns</strong> 📌: Identifying
and retrieving desired segments of text.</li>
<li><strong>Splitting</strong> ✂️: Breaking down strings into manageable
components.</li>
</ul>
<p><strong>Exercise 2</strong>: An interesting exercise in which we have
scraped Berlin’s climate data from its Wikipedia page and prepared it
for analysis. In this exercise, you’ll see how <strong>stringi</strong>
can be creatively applied to <strong>scraping and cleaning numeric
data</strong> by treating numbers as strings—a powerful trick in data
pre-processing. 🧹 Exercise 2 is available in the GitHub repository as
an additional exercise.</p>
</div>
<div
id="this-workshop-will-show-you-the-versatility-and-power-of-stringi."
class="section level2">
<h2>This workshop will show you the versatility and power of
<strong>stringi</strong>. 💡</h2>
<div id="lets-begin" class="section level3">
<h3>Let’s Begin! 🚀</h3>
</div>
<div id="exercise-1" class="section level3">
<h3>Exercise 1</h3>
<p><strong>Exercise 1.1 </strong>: Scrape the political slogans, along
with their descriptions, from this Wikipedia page (<a
href="https://en.wikipedia.org/wiki/List_of_political_slogans"
class="uri">https://en.wikipedia.org/wiki/List_of_political_slogans</a>).</p>
<p><strong>Exercise 1.2 </strong>: <strong>Transliteration</strong>-
Convert the non-ASCII characters in the slogans to their closest ASCII
equivalents.</p>
<p><strong>Exercise 1.3 </strong>: <strong>Sorting</strong>- Sort these
slogans alphabetically</p>
<p><strong>Exercise 1.4 </strong>: <strong>Joining or
Concatenation</strong>- Create a variable concatenated_slogans holding a
single character string in which the elements are separated by “|”.</p>
<p><strong>Exercise 1.5 </strong>: <strong>Padding/Formatting</strong>-
Create a variable padded_slogans that right-pads each slogan with spaces
so that they all have a width of 150 characters.</p>
<p><strong>Exercise 1.6 </strong>:<strong>Case Change</strong>-Change
the case of all the slogans to uppercase</p>
<p><strong>Exercise 1.7 </strong>: <strong>String Counting</strong>-
Count the number of times the word “power” occurs in these slogans,
ignoring case.</p>
<p><strong>Exercise 1.8 </strong>: <strong>String Detection</strong>-
Extract the slogans that have the word “Power” in them.</p>
<p><strong>Exercise 1.9 </strong>: <strong>Pattern Searching</strong>-
Extract and print the slogans that have the word “Power” or “People” in
them.</p>
<p><strong>Exercise 1.10 </strong>: <strong>String Detection with
Pattern</strong>- Extract and print the slogans that have both the words
“Power” and “People” in them.</p>
<p><strong>Exercise 1.11 </strong>: <strong>Looking behind</strong>- Look
for the slogans where the word “Black” precedes the word “Power”.</p>
<p><strong>Exercise 1.12 </strong>: <strong>Looking ahead</strong>- Find
the slogans in which the phrase “to the people” appears after “power”.</p>
<p><strong>Exercise 1.13 </strong>: <strong>String
trimming</strong>- Trim a specific part (either the slogan or its
description) of each entry.</p>
<p><strong>Exercise 1.14 </strong>: <strong>String split</strong>- Create
a data frame with two columns: the slogans and their descriptions.</p>
</div>
</div>
</div>


</div>
</div>

</div>

<script>

// Give pandoc tables the Bootstrap "table table-condensed" classes.
// Tables are found by walking up from every tr.odd row to a tbody parent
// and from there to a table parent.
function bootstrapStylePandocTables() {
  var oddRows = $('tr.odd');
  var pandocTables = oddRows.parent('tbody').parent('table');
  pandocTables.addClass('table table-condensed');
}
// Once the DOM is ready, apply the Bootstrap table classes.
$(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- tabsets -->

<script>
// Once the DOM is ready, call window.buildTabsets with the id "TOC".
$(function () {
  window.buildTabsets("TOC");
});

// Once the DOM is ready, make every tab of a dropdown-style tabset toggle
// the "nav-tabs-open" class on its parent tab strip when clicked.
$(function () {
  var dropdownTabs = $('.tabset-dropdown > .nav-tabs > li');
  dropdownTabs.on('click', function () {
    var tabStrip = $(this).parent();
    tabStrip.toggleClass('nav-tabs-open');
  });
});
</script>

<!-- code folding -->

<script>
$(function () {

    // Elements carrying both the "unlisted" and "unnumbered" classes are
    // additionally tagged "toc-ignore", for consistency with Pandoc.
    $('.unlisted.unnumbered').addClass('toc-ignore');

    // A "toc-ignore" class placed on a section div is transferred to the
    // heading elements that are its direct children.
    var ignoredSections = $('div.section.toc-ignore');
    ignoredSections.removeClass('toc-ignore');
    ignoredSections.children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // Options handed to the tocify plugin.
    var tocSettings = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      // Anchor hash for a heading: drop the characters . \ / ? & ! # < >
      // and turn every whitespace character into an underscore.
      hashGenerator: function (text) {
        var stripped = text.replace(/[.\\/?&!#<>]/g, '');
        return stripped.replace(/\s/g, '_');
      },
      ignoreSelector: ".toc-ignore",
      scrollTo: 0,
      showAndHide: true,
      smoothScroll: true
    };

    // Build the table of contents inside #TOC and fetch the plugin data
    // stored under the "toc-tocify" key.
    var toc = $("#TOC").tocify(tocSettings).data("toc-tocify");
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  // Create a script element pointing at the hosted MathJax build and append
  // it to the document head so that it is fetched at runtime.
  (function () {
    var mathjaxLoader = document.createElement("script");
    mathjaxLoader.type = "text/javascript";
    mathjaxLoader.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    var headElement = document.getElementsByTagName("head")[0];
    headElement.appendChild(mathjaxLoader);
  })();
</script>

</body>
</html>