pyt2~20171216-135827.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# Building a simple word count application with Spark\n",
  8. "\n",
  9. "This lab will build on the techniques covered in the first Spark workshop. We will develop a simple word count application that finds the most common words in the [Complete Works of William Shakespeare](http://www.gutenberg.org/ebooks/100) retrieved from [Project Gutenberg](http://www.gutenberg.org/wiki/Main_Page). \n",
  10. "\n",
  11. "This lab is mandatory for Workshop 2 and required to validate your registration. \n",
  12. "\n",
  13. "#### Read me first:\n",
  14. "You must execute each cell and fill in the appropriate code where necessary.\n",
  15. "At the end of the notebook, a code is generated that you must copy and paste into the meetup registration. "
  16. ]
  17. },
  18. {
  19. "cell_type": "markdown",
  20. "metadata": {},
  21. "source": [
  22. "Set up imports and functions"
  23. ]
  24. },
  25. {
  26. "cell_type": "code",
  27. "execution_count": null,
  28. "metadata": {
  29. "collapsed": false
  30. },
  31. "outputs": [],
  32. "source": [
  33. "# Just execute this cell\n",
  34. "import os.path\n",
  35. "import re\n",
  36. "import hashlib"
  37. ]
  38. },
  39. {
  40. "cell_type": "markdown",
  41. "metadata": {},
  42. "source": [
  43. "Loads the [Complete Works of William Shakespeare](http://www.gutenberg.org/ebooks/100) retrieved from [Project Gutenberg](http://www.gutenberg.org/wiki/Main_Page)."
  44. ]
  45. },
  46. {
  47. "cell_type": "code",
  48. "execution_count": null,
  49. "metadata": {
  50. "collapsed": false
  51. },
  52. "outputs": [],
  53. "source": [
  54. "# Just execute this cell\n",
  55. "baseDir = os.path.join('data')\n",
  56. "inputPath = os.path.join('shakespeare.txt')\n",
  57. "fileName = os.path.join(baseDir, inputPath)\n",
  58. "\n",
  59. "shakespeareRDD = (sc\n",
  60. " .textFile(fileName, 8))\n",
  61. "\n",
  62. "shakespeareRDD.cache()\n",
  63. "print '\\n'.join(shakespeareRDD\n",
  64. " .zipWithIndex() # to (line, lineNum)\n",
  65. " .map(lambda (l, num): '{0}: {1}'.format(num, l)) # to 'lineNum: line'\n",
  66. " .take(15))"
  67. ]
  68. },
  69. {
  70. "cell_type": "code",
  71. "execution_count": null,
  72. "metadata": {
  73. "collapsed": false
  74. },
  75. "outputs": [],
  76. "source": [
  77. "# Just execute this cell\n",
  78. "def toLower(text):\n",
  79. " \"\"\"\n",
  80. " Changes all text to lower case.\n",
  81. " \"\"\"\n",
  82. " return text.lower()\n",
  83. "\n",
  84. "print toLower('Hello WORLD') #should be \"hello world\""
  85. ]
  86. },
  87. {
  88. "cell_type": "markdown",
  89. "metadata": {},
  90. "source": [
  91. "#### Define the function `removePunctuation`, which removes any punctuation. We use the Python [re](https://docs.python.org/2/library/re.html) module to remove any text that is not a letter, number, or space."
  92. ]
  93. },
  94. {
  95. "cell_type": "code",
  96. "execution_count": null,
  97. "metadata": {
  98. "collapsed": false
  99. },
  100. "outputs": [],
  101. "source": [
  102. "# Just execute this cell\n",
  103. "pattern=re.compile(\"[^a-zA-Z0-9\\s]\")\n",
  104. "def removePunctuation(text):\n",
  105. " \"\"\"Removes punctuation from the given text\n",
  106. "\n",
  107. " Note:\n",
  108. " Only spaces, letters, and numbers should be retained. Other characters should be\n",
  109. " eliminated (e.g. it's becomes its). Leading and trailing spaces are not removed here;\n",
  110. " apply `strips` to the result to remove them.\n",
  111. "\n",
  112. " Args:\n",
  113. " text (str): A string.\n",
  114. "\n",
  115. " Returns:\n",
  116. " str: The cleaned up string.\n",
  117. " \"\"\"\n",
  118. " cleanText = pattern.sub('', text)\n",
  119. " return cleanText\n",
  120. "print removePunctuation('Hi, you! My ZIP code is 98-9800') #should be Hi you My ZIP code is 989800\n",
  121. "print removePunctuation('No under_score!') #No underscore"
  122. ]
  123. },
  124. {
  125. "cell_type": "code",
  126. "execution_count": null,
  127. "metadata": {
  128. "collapsed": false
  129. },
  130. "outputs": [],
  131. "source": [
  132. "# Just execute this cell\n",
  133. "def strips(text):\n",
  134. " \"\"\"strips leading and trailing spaces.\n",
  135. " \"\"\"\n",
  136. " return text.strip()\n",
  137. "print '>%s<' % strips(' This is a text') #should print >This is a text<\n",
  138. "print '>%s<' % (strips(removePunctuation('No under_score !'))) #should print >No underscore<"
  139. ]
  140. },
  141. {
  142. "cell_type": "code",
  143. "execution_count": null,
  144. "metadata": {
  145. "collapsed": false
  146. },
  147. "outputs": [],
  148. "source": [
  149. "# Just execute this cell\n",
  150. "stopfile = os.path.join(baseDir, 'stopwords.txt')\n",
  151. "stopwords = set(sc.textFile(stopfile).collect())\n",
  152. "print 'These are the stopwords: %s' % stopwords"
  153. ]
  154. },
  155. {
  156. "cell_type": "code",
  157. "execution_count": null,
  158. "metadata": {
  159. "collapsed": false
  160. },
  161. "outputs": [],
  162. "source": [
  163. "# Just execute this cell\n",
  164. "def isNotStopWord(word):\n",
  165. " \"\"\" Tells if the given word isn't a common English word.\n",
  166. " Args:\n",
  167. " word (str): input word\n",
  168. " Returns:\n",
  169. " Boolean: True if word isn't a stopword. Otherwise, False\n",
  170. " \"\"\"\n",
  171. " return word not in stopwords\n",
  172. "\n",
  173. "print isNotStopWord('brown') # Should give True\n",
  174. "print isNotStopWord('the') # Should give False"
  175. ]
  176. },
  177. {
  178. "cell_type": "markdown",
  179. "metadata": {},
  180. "source": [
  181. "#### `wordCount` function\n",
  182. "#### First, define a function for word counting. You should reuse the techniques that have been covered during the first workshop. This function should take in an RDD that is a list of words and return a pair RDD that has all of the words and their associated counts."
  183. ]
  184. },
  185. {
  186. "cell_type": "code",
  187. "execution_count": null,
  188. "metadata": {
  189. "collapsed": false
  190. },
  191. "outputs": [],
  192. "source": [
  193. "# TODO: Replace <FILL IN> with appropriate code\n",
  194. "def wordCount(wordListRDD):\n",
  195. " \"\"\"Creates a pair RDD with word counts from an RDD of words.\n",
  196. " Args:\n",
  197. " wordListRDD (RDD of str): An RDD consisting of words.\n",
  198. "\n",
  199. " Returns:\n",
  200. " RDD of (str, int): An RDD consisting of (word, count) tuples.\n",
  201. " \"\"\"\n",
  202. " return <FILL IN>"
  203. ]
  204. },
  205. {
  206. "cell_type": "markdown",
  207. "metadata": {},
  208. "source": [
  209. "#### Before you can use the `wordCount()` function, you have to address two issues with the format of the RDD:\n",
  210. " + #### The first issue is that we need to split each line by its spaces.\n",
  211. " + #### The second issue is that we need to filter out empty lines.\n",
  212. " \n",
  213. "#### Apply a transformation that will split each element of the RDD by its spaces. You might think that a `map()` transformation is the way to do this, but think about what the result of the `split()` function will be."
  214. ]
  215. },
  216. {
  217. "cell_type": "code",
  218. "execution_count": null,
  219. "metadata": {
  220. "collapsed": false
  221. },
  222. "outputs": [],
  223. "source": [
  224. "# TODO: Replace <FILL IN> with appropriate code\n",
  225. "cleanRDD = (shakespeareRDD\n",
  226. " .map(removePunctuation)\n",
  227. " .map(toLower)\n",
  228. " .map(strips)\n",
  229. " .<FILL IN>(lambda line: line.split(' '))\n",
  230. " .filter(<FILL IN>)\n",
  231. " .filter(isNotStopWord))"
  232. ]
  233. },
  234. {
  235. "cell_type": "markdown",
  236. "metadata": {},
  237. "source": [
  238. "#### You now have an RDD that is only words. Next, let's apply the `wordCount()` function to produce a list of word counts. We can view the top 15 words by using the `takeOrdered()` action; however, since the elements of the RDD are pairs, we need a custom sort function that sorts using the value part of the pair.\n",
  239. "\n",
  240. "#### Use the `wordCount()` function and `takeOrdered()` to obtain the fifteen most common words and their counts."
  241. ]
  242. },
  243. {
  244. "cell_type": "code",
  245. "execution_count": null,
  246. "metadata": {
  247. "collapsed": false
  248. },
  249. "outputs": [],
  250. "source": [
  251. "#collect the top 15\n",
  252. "top15WordsAndCounts = wordCount(cleanRDD).<FILL IN>\n",
  253. "print '\\n'.join(map(lambda (w, c): '{0}: {1}'.format(w, c), top15WordsAndCounts))"
  254. ]
  255. },
  256. {
  257. "cell_type": "markdown",
  258. "metadata": {},
  259. "source": [
  260. "#### Generate the MD5 code to validate your registration"
  261. ]
  262. },
  263. {
  264. "cell_type": "code",
  265. "execution_count": null,
  266. "metadata": {
  267. "collapsed": false
  268. },
  269. "outputs": [],
  270. "source": [
  271. "md5_code = hashlib.md5()\n",
  272. "for (word, count) in top15WordsAndCounts:\n",
  273. " md5_code.update(word)\n",
  274. "\n",
  275. "meetup_code = md5_code.hexdigest()\n",
  276. "if hashlib.sha224(meetup_code).hexdigest() == '427681d5929a35ab878c291b0de5f4b8a009dc9b71d2e54dbf7c46ba':\n",
  277. " print 'Well done, copy this code: %s' % md5_code.hexdigest()\n",
  278. "else:\n",
  279. " print 'This is not the expected code, please try again. \\nTip: the code starts with \"cc\" and finishes with \"ad1c\"'"
  280. ]
  281. }
  282. ],
  283. "metadata": {
  284. "kernelspec": {
  285. "display_name": "Python 2",
  286. "language": "python",
  287. "name": "python2"
  288. },
  289. "language_info": {
  290. "codemirror_mode": {
  291. "name": "ipython",
  292. "version": 2
  293. },
  294. "file_extension": ".py",
  295. "mimetype": "text/x-python",
  296. "name": "python",
  297. "nbconvert_exporter": "python",
  298. "pygments_lexer": "ipython2",
  299. "version": "2.7.13"
  300. }
  301. },
  302. "nbformat": 4,
  303. "nbformat_minor": 0
  304. }