Algorithm Execution Container-Zeppelin Version.json
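The notebook exported below wires an %angular form to Spark paragraphs through Zeppelin's ZeppelinContext (`z`): the form binds values via ng-model, a button pushes them back with z.angularBind and triggers a paragraph by ID, and paragraphs exchange state with z.put/z.get. As orientation before the raw export, here is a minimal sketch of that hand-off; it only runs inside a Zeppelin Spark paragraph, and the paragraph ID shown is the notebook's own "Redirect Paragraph Execution" ID, not something you would reuse elsewhere:

    // Push state to the Angular front end, read back what the form bound,
    // share it across paragraphs, then trigger another paragraph by ID.
    z.angularBind("result", false)              // flag consumed by ng-show in the form
    val datasetPath = z.angular("datasetPath")  // value the form bound via ng-model
    z.put("datasetPath", datasetPath)           // cross-paragraph key/value store
    z.run("20180221-124127_25791550")           // run the redirect paragraph
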
{"paragraphs":[{"text":"z.angularBind(\"result\", false)\nz.angularBind(\"error\", false)\n\nprintln(\"\"\"%angular <div class=\"container\">\n\n <h1>Set Analysis</h1>\n <form>\n <div class=\"row\">\n <div class=\"col-sm-4\">\n <h3>Set Dataset Path and Separator</h3>\n </div>\n <div class=\"form-group col-sm-6\" style=\"margin-top: 20px;\">\n <input type=\"text\" class=\"form-control\" id=\"datasetPath\" ng-model=\"datasetPath\" placeholder=\"Path\">\n </div>\n <div class=\"form-group col-sm-2\" style=\"margin-top: 20px;\">\n <input type=\"text\" class=\"form-control\" id=\"separator\" ng-model=\"datasetSeparator\" placeholder=\"Delimiter\">\n </div>\n </div>\n <br>\n <div class=\"row\">\n <div class=\"col-sm-4\">\n <h3>Set Output Path</h3>\n </div>\n <div class=\"form-group col-sm-12\" style=\"margin-top: 20px;\">\n <input type=\"text\" class=\"form-control\" id=\"datasetOutputPath\" ng-model=\"datasetOutputPath\" placeholder=\"Output Path without filename (append slash ( / ) in the end)\">\n </div>\n </div>\n <br>\n <div class=\"row\">\n <div class=\"col-sm-4\">\n <h3>Set Analysis Type</h3>\n </div>\n <div class=\"form-group col-sm-4\">\n <select id=\"algorithmFamily\" name=\"algorithmFamily\" ng-options=\"x.name for x in algorithmFamilies\"\n ng-model=\"selectedAlgorithmFamily\" class=\"form-control\" style=\"margin-top: 17px; margin-bottom: 10px;\">\n <option value=\"\" disabled selected>Algorithm Family</option>\n </select>\n </div>\n <div class=\"form-group col-sm-4\">\n <select id=\"algorithm\" name=\"algorithm\" ng-options=\"x.name for x in algorithmFamilies[selectedAlgorithmFamily.id].algorithms\"\n ng-model=\"selectedAlgorithm\" class=\"form-control\" style=\"margin-top: 17px; margin-bottom: 10px;\">\n <option value=\"\" disabled selected>Algorithm</option>\n </select>\n </div>\n </div>\n <br>\n <div class=\"row\">\n <div class=\"col-sm-12\">\n <div class=\"well\">\n <h4 ng-if=\"selectedAlgorithm !== undefined\" ng-bind-html=\"selectedAlgorithm.information\"></h4>\n <h4 ng-if=\"selectedAlgorithm == undefined\" style=\"color: darkgray\">Algorithm Information</h4>\n </div>\n </div>\n </div>\n <br>\n <br>\n <div class=\"row\" ng-if=\"selectedAlgorithm !== undefined && selectedAlgorithm.parameters.length > 0\">\n <div class=\"col-sm-12\">\n <h3>Set Configuration for Analysis</h3>\n </div>\n </div>\n <div class=\"row\" ng-if=\"selectedAlgorithm !== undefined && selectedAlgorithm.parameters.length > 0\">\n <br>\n <div class=\"form-horizontal col-sm-8\" ng-repeat=\"parameter in selectedAlgorithm.parameters\">\n <div class=\"form-group\">\n <label for=\"exampleInputName1\" class=\"col-sm-2 control-label\">{{ parameter.name }} ({{ parameter.kind }})</label>\n <div class=\"col-sm-7\">\n <input type=\"text\" class=\"form-control\" id=\"exampleInputName1\" ng-model=\"parameter.value\">\n </div>\n <div class=\"col-sm-1\">\n <i class=\"fa fa-info-circle\" style=\"font-size: 20px; margin-top: 7px;\" data-toggle=\"tooltip\"\n data-placement=\"right\" title=\"{{ parameter.information }}\"></i>\n </div>\n </div>\n </div>\n </div>\n <br>\n <div class=\"row\" ng-show=\"selectedAlgorithm !== null && selectedAlgorithm !== undefined && selectedAlgorithmFamily.id === 4\">\n <div class=\"col-sm-4\">\n <h3>Evaluation Methods</h3>\n </div>\n <div class=\"form-group col-sm-4\">\n <select id=\"evaluationMethod1\" name=\"evaluationMethod1\" ng-options=\"method.name for method in evaluationMethodFamily.methods\"\n ng-model=\"selectedEvalMeth1\" class=\"form-control\" style=\"margin-top: 17px; margin-bottom: 
10px;\">\n <option value=\"\" disabled selected>Evaluation Method</option>\n </select>\n </div>\n \n <br>\n <div class=\"form-horizontal col-sm-8\" ng-repeat=\"parameter in selectedEvalMeth1.parameters\">\n <div class=\"form-group\">\n <label for=\"exampleInputName2\" class=\"col-sm-2 control-label\">{{ parameter.information }}</label>\n <div class=\"col-sm-7\">\n <input type=\"text\" class=\"form-control\" id=\"exampleInputName2\" ng-model=\"parameter.value\">\n </div>\n </div>\n </div>\n </div>\n <div class=\"row\">\n <br>\n <div class=\"form-group col-sm-7 text-center\">\n <button class=\"btn btn-primary text-center\" style=\"width:200px;\" ng-click=\"z.angularBind('datasetPath', datasetPath, '20180221-124127_25791550'); z.angularBind('datasetSeparator', datasetSeparator, '20180221-124127_25791550'); z.angularBind('datasetOutputPath', datasetOutputPath, '20180221-124127_25791550'); z.angularBind('selectedAlgorithm', selectedAlgorithm, '20180221-124127_25791550'); z.angularBind('selectedEvalMeth', selectedEvalMeth1, '20180221-124127_25791550'); z.runParagraph('20180221-124127_25791550')\" ng-disabled=\"selectedAlgorithm === undefined || selectedAlgorithm === null || (selectedAlgorithmFamily.id === 4 && (selectedEvalMeth1 === null || selectedEvalMeth1 === undefined))\">Set & Run Analysis</button>\n </div>\n </div>\n </form>\n <br>\n <br>\n <div class=\"row\" ng-show=\"result\">\n <div class=\"col-sm-12\">\n <h3>Result</h3>\n </div>\n <div class=\"col-sm-12\">\n <h5 class=\"text-success\">{{result1}}</h5>\n <h5 class=\"text-success\">{{result2}}</h5>\n {{ arrayObjects }}\n <h5 ng-repeat=\"object in arrayObjects\">{{object.values}}</h5>\n </div>\n </div>\n <div class=\"row\" ng-show=\"error\">\n <div class=\"col-sm-12\">\n <h3>Error</h3>\n </div>\n <div class=\"col-sm-12\">\n <h5 class=\"text-danger\">{{ errorMessage }} </h5>\n </div>\n </div>\n</div>\"\"\")","user":"suite5__PROJECTGENERICUSER","dateUpdated":"2018-08-28T17:36:05+0200","config":{"tableHide":false,"editorSetting":{"language":"scala","editOnDblClick":false},"colWidth":9,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"results":{"0":{"graph":{"mode":"table","height":997.2,"optionOpen":false}},"1":{"graph":{"mode":"table","height":300,"optionOpen":false,"setting":{"table":{"tableGridState":{},"tableColumnTypeState":{"names":{"pcaFeatures":"string"},"updated":false},"tableOptionSpecHash":"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]","tableOptionValue":{"useFilter":false,"showPagination":false,"showAggregationFooter":false},"updated":false,"initialized":false}},"commonSetting":{}}}},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"ANGULAR","data":"<div class=\"container\">\n\n <h1>Set Analysis</h1>\n <form>\n <div class=\"row\">\n <div class=\"col-sm-4\">\n <h3>Set Dataset Path and Separator</h3>\n </div>\n <div class=\"form-group col-sm-6\" style=\"margin-top: 20px;\">\n <input type=\"text\" class=\"form-control\" id=\"datasetPath\" ng-model=\"datasetPath\" placeholder=\"Path\">\n </div>\n <div class=\"form-group col-sm-2\" 
style=\"margin-top: 20px;\">\n <input type=\"text\" class=\"form-control\" id=\"separator\" ng-model=\"datasetSeparator\" placeholder=\"Delimiter\">\n </div>\n </div>\n <br>\n <div class=\"row\">\n <div class=\"col-sm-4\">\n <h3>Set Output Path</h3>\n </div>\n <div class=\"form-group col-sm-12\" style=\"margin-top: 20px;\">\n <input type=\"text\" class=\"form-control\" id=\"datasetOutputPath\" ng-model=\"datasetOutputPath\" placeholder=\"Output Path without filename (append slash ( / ) in the end)\">\n </div>\n </div>\n <br>\n <div class=\"row\">\n <div class=\"col-sm-4\">\n <h3>Set Analysis Type</h3>\n </div>\n <div class=\"form-group col-sm-4\">\n <select id=\"algorithmFamily\" name=\"algorithmFamily\" ng-options=\"x.name for x in algorithmFamilies\"\n ng-model=\"selectedAlgorithmFamily\" class=\"form-control\" style=\"margin-top: 17px; margin-bottom: 10px;\">\n <option value=\"\" disabled selected>Algorithm Family</option>\n </select>\n </div>\n <div class=\"form-group col-sm-4\">\n <select id=\"algorithm\" name=\"algorithm\" ng-options=\"x.name for x in algorithmFamilies[selectedAlgorithmFamily.id].algorithms\"\n ng-model=\"selectedAlgorithm\" class=\"form-control\" style=\"margin-top: 17px; margin-bottom: 10px;\">\n <option value=\"\" disabled selected>Algorithm</option>\n </select>\n </div>\n </div>\n <br>\n <div class=\"row\">\n <div class=\"col-sm-12\">\n <div class=\"well\">\n <h4 ng-if=\"selectedAlgorithm !== undefined\" ng-bind-html=\"selectedAlgorithm.information\"></h4>\n <h4 ng-if=\"selectedAlgorithm == undefined\" style=\"color: darkgray\">Algorithm Information</h4>\n </div>\n </div>\n </div>\n <br>\n <br>\n <div class=\"row\" ng-if=\"selectedAlgorithm !== undefined && selectedAlgorithm.parameters.length > 0\">\n <div class=\"col-sm-12\">\n <h3>Set Configuration for Analysis</h3>\n </div>\n </div>\n <div class=\"row\" ng-if=\"selectedAlgorithm !== undefined && selectedAlgorithm.parameters.length > 0\">\n <br>\n <div class=\"form-horizontal col-sm-8\" ng-repeat=\"parameter in selectedAlgorithm.parameters\">\n <div class=\"form-group\">\n <label for=\"exampleInputName1\" class=\"col-sm-2 control-label\">{{ parameter.name }} ({{ parameter.kind }})</label>\n <div class=\"col-sm-7\">\n <input type=\"text\" class=\"form-control\" id=\"exampleInputName1\" ng-model=\"parameter.value\">\n </div>\n <div class=\"col-sm-1\">\n <i class=\"fa fa-info-circle\" style=\"font-size: 20px; margin-top: 7px;\" data-toggle=\"tooltip\"\n data-placement=\"right\" title=\"{{ parameter.information }}\"></i>\n </div>\n </div>\n </div>\n </div>\n <br>\n <div class=\"row\" ng-show=\"selectedAlgorithm !== null && selectedAlgorithm !== undefined && selectedAlgorithmFamily.id === 4\">\n <div class=\"col-sm-4\">\n <h3>Evaluation Methods</h3>\n </div>\n <div class=\"form-group col-sm-4\">\n <select id=\"evaluationMethod1\" name=\"evaluationMethod1\" ng-options=\"method.name for method in evaluationMethodFamily.methods\"\n ng-model=\"selectedEvalMeth1\" class=\"form-control\" style=\"margin-top: 17px; margin-bottom: 10px;\">\n <option value=\"\" disabled selected>Evaluation Method</option>\n </select>\n </div>\n \n <br>\n <div class=\"form-horizontal col-sm-8\" ng-repeat=\"parameter in selectedEvalMeth1.parameters\">\n <div class=\"form-group\">\n <label for=\"exampleInputName2\" class=\"col-sm-2 control-label\">{{ parameter.information }}</label>\n <div class=\"col-sm-7\">\n <input type=\"text\" class=\"form-control\" id=\"exampleInputName2\" ng-model=\"parameter.value\">\n </div>\n </div>\n </div>\n </div>\n 
<div class=\"row\">\n <br>\n <div class=\"form-group col-sm-7 text-center\">\n <button class=\"btn btn-primary text-center\" style=\"width:200px;\" ng-click=\"z.angularBind('datasetPath', datasetPath, '20180221-124127_25791550'); z.angularBind('datasetSeparator', datasetSeparator, '20180221-124127_25791550'); z.angularBind('datasetOutputPath', datasetOutputPath, '20180221-124127_25791550'); z.angularBind('selectedAlgorithm', selectedAlgorithm, '20180221-124127_25791550'); z.angularBind('selectedEvalMeth', selectedEvalMeth1, '20180221-124127_25791550'); z.runParagraph('20180221-124127_25791550')\" ng-disabled=\"selectedAlgorithm === undefined || selectedAlgorithm === null || (selectedAlgorithmFamily.id === 4 && (selectedEvalMeth1 === null || selectedEvalMeth1 === undefined))\">Set & Run Analysis</button>\n </div>\n </div>\n </form>\n <br>\n <br>\n <div class=\"row\" ng-show=\"result\">\n <div class=\"col-sm-12\">\n <h3>Result</h3>\n </div>\n <div class=\"col-sm-12\">\n <h5 class=\"text-success\">{{result1}}</h5>\n <h5 class=\"text-success\">{{result2}}</h5>\n {{ arrayObjects }}\n <h5 ng-repeat=\"object in arrayObjects\">{{object.values}}</h5>\n </div>\n </div>\n <div class=\"row\" ng-show=\"error\">\n <div class=\"col-sm-12\">\n <h3>Error</h3>\n </div>\n <div class=\"col-sm-12\">\n <h5 class=\"text-danger\">{{ errorMessage }} </h5>\n </div>\n </div>\n</div>\n"}]},"apps":[],"jobName":"paragraph_1532679401523_-447371956","id":"20180215-104049_950800735","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-28T17:36:06+0200","dateFinished":"2018-08-28T17:36:38+0200","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:209"},{"title":"Dataset Result","text":"val displayDataframe = z.get(\"displayDataframe\").asInstanceOf[Boolean]\nif (displayDataframe) {\n var datasetResult = z.get(\"datasetResult\")\n if (datasetResult != null) {\n z.show(datasetResult)\n }\n}\n// save part - Note the coalesce(1) => will bring all result to a single file otherwise you'll have 1 file per executor \n// dataframe.coalesce(1)\n// \t.write\n// \t.option(\"header\", \"true\")\n// \t.csv(\"/path/to/sample_file.csv\") ","user":"anonymous","dateUpdated":"2018-08-28T17:38:42+0200","config":{"tableHide":false,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","fontSize":9,"editorHide":true,"title":true,"results":{"0":{"graph":{"mode":"table","height":300,"optionOpen":false,"setting":{"table":{"tableGridState":{"columns":[],"scrollFocus":{},"selection":[],"grouping":{"grouping":[],"aggregations":[],"rowExpandedStates":{}},"treeView":{},"pagination":{"paginationCurrentPage":1,"paginationPageSize":250}},"tableColumnTypeState":{"names":{"pcaFeatures":"string"},"updated":false},"tableOptionSpecHash":"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated 
values\"}]","tableOptionValue":{"useFilter":false,"showPagination":false,"showAggregationFooter":false},"updated":false,"initialized":false},"multiBarChart":{"rotate":{"degree":"-45"},"xLabelStatus":"default"}},"commonSetting":{},"keys":[{"name":"pcaFeatures","index":0,"aggr":"sum"}],"groups":[],"values":[]},"helium":{}},"1":{"graph":{"mode":"scatterChart","height":300,"optionOpen":false,"setting":{"table":{"tableGridState":{"columns":[],"scrollFocus":{},"selection":[],"grouping":{"grouping":[],"aggregations":[],"rowExpandedStates":{}},"treeView":{},"pagination":{"paginationCurrentPage":1,"paginationPageSize":250}},"tableColumnTypeState":{"names":{"features":"string","prediction":"string","_c1":"string"},"updated":false},"tableOptionSpecHash":"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]","tableOptionValue":{"useFilter":false,"showPagination":false,"showAggregationFooter":false},"updated":false,"initialized":false},"multiBarChart":{"rotate":{"degree":"-45"},"xLabelStatus":"default"},"scatterChart":{"yAxis":{"name":"pcaFeatures","index":0,"aggr":"sum"}},"lineChart":{"rotate":{"degree":"-45"},"xLabelStatus":"default"},"stackedAreaChart":{"rotate":{"degree":"-45"},"xLabelStatus":"default"}},"keys":[],"groups":[],"values":[],"commonSetting":{}},"helium":{}}},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"displayDataframe: Boolean = true\n"},{"type":"TABLE","data":"features\tprediction\t_c1\n[1782.0]\t2.3100208092314903E7\t9516688.918558078\n[5351.0]\t3.2162852140772358E7\t1163704.7766375195\n[7669.0]\t3.8048873128254674E7\t1471270.983213429\n[9026.0]\t4.149465849495593E7\t2.624218636429714E7\n[21089.0]\t7.21258367399944E7\t6.131414449774677E7\n[24498.0]\t8.078219806282754E7\t16779.68191345087\n[24551.0]\t8.09167792156906E7\t1.3111348464619492E8\n[28950.0]\t9.20870149033242E7\t6295879.888554699\n[31796.0]\t9.931376888536677E7\t9.24436691379561E7\n[34796.0]\t1.0693156999082275E8\t2.8284065353455987E7\n[41952.0]\t1.2510256489437039E8\t28734.640200550693\n[52439.0]\t1.5173185829200935E8\t2.8E7\n[60398.0]\t1.7194188462478402E8\t5.5E8\n"}]},"apps":[],"jobName":"paragraph_1532679401523_-447371956","id":"20180327-125756_1304518740","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-28T17:37:29+0200","dateFinished":"2018-08-28T17:37:30+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:210"},{"title":"Algorithm Initialization","text":"// MARK: - Class Declaration\n\ncase class Parameter(name: String, kind: String, information: String, value: String = \"\")\ncase class ParameterGroup(name: String, information: String, parameters: Array[Parameter])\ncase class Algorithm(id: String, name: String, information: String, parameters: Array[Parameter], paragraphId: String = \"\")\ncase class AlgorithmFamily(id: Integer, name: String, algorithms: Array[Algorithm])\n\ncase class EvaluationMethod(name: String, value: String, parameters: Array[Parameter])\ncase class EvaluationMethodFamily(methods: Array[EvaluationMethod])\n\n// MARK: - Algorithm Instantiation\n\nval a1 = 
Algorithm(\"stringIndexer\", \"String Indexer\", \"asdjiajdoiasjdojasd\", Array())\nval a2 = Algorithm(\"oneHotEncoder\", \"OneHotEncoder\", \"qqqqqqqqqqqqqqqqqqqqqqqq\", Array())\nval a3 = Algorithm(\"binarizer\", \"Binarizer\", \"eeeeeeeeeeeeeeeeeeeeee\", Array())\nval a4 = Algorithm(\"normalizer\", \"Normalizer\",\"rrrrrrrrrrrrrrrr\", Array())\n\n\n\nval b1 = Algorithm(\"pca\", \"PCA\", \"Principal Component Analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a NEW set of values of linearly uncorrelated variables called principal components. Output is named pcaFeatures. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-features.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"numTopFeatures\", \"Integer\", \"Number of top features\", \"1\")), \"20180314-100351_1767145110\")\n\nval b2 = Algorithm(\"svd\", \"SVD\", \"zzzzzzzzzzzzzzzzzzzzzzzzzzz\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"numTopFeatures\", \"Integer\", \"Number of top features\")), \"20180314-113307_1167994792\")\n\nval b3 = Algorithm(\"chiSquared\", \"ChiSquared\", \"Chi-Squared feature selection operates on labeled data with categorical features. It uses the Chi-Squared test of independence to decide which features to choose. Output is named “chiFeatures”. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-features.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"numTopFeatures\", \"Integer\", \"Number of top features\", \"1\")), \"20180314-113514_1190037522\")\n\n\n\nval c1 = Algorithm(\"tokenizer\", \"Tokenizer\", \"Tokenization is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). Here we use RegexTokenizer that converts the input string to lowercase, removes stopwords and then splits it by white spaces. Output is named 'tokensOut'. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-features.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\")), \"20180314-113929_1964205303\")\n\nval c2 = Algorithm(\"nGram\", \"n-gram\", \"An n-gram is a sequence of n tokens (typically words) for some integer n. This function can be used to transform input features into n-grams, taking as input a sequence of strings (e.g. the output of the Tokenizer) and the output will consist of a sequence of n-grams where each n-gram is represented by a space-delimited string of n consecutive words. Output is named “ngramsOut”. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-features.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"numOfTerms\", \"Integer\", \"Number of Terms\", \"2\")), \"20180314-114545_1909431260\")\n\nval c3 = Algorithm(\"tfIdf\", \"TF-IDF\", \"Term frequency-inverse document frequency (TF-IDF) is a feature vectorization method widely used in text mining to reflect the importance of a term to a document in a corpus. Output is named “tfidfOut”. 
<a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-features.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"numOfTopFeatures\", \"Integer\", \"Number of Top features\", \"2\"), Parameter(\"minDocFrequency\", \"Integer\", \"minDocFrequency\", \"0\")), \"20180314-114722_563562800\")\n\nval c4 = Algorithm(\"word2vec\", \"Word2Vec\", \" <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-features.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"vectorSize\", \"Integer\", \"\"), Parameter(\"minCount\", \"Integer\", \"\"), Parameter(\"maxSentenceLength\", \"Integer\", \"\"), Parameter(\"seed\", \"Integer\", \"\")))\n\n\n\nval d1 = Algorithm(\"als\", \"Collaborative Filtering (ALS)\", \"Collaborative Filtering produces recommendations based on what similar users like and aims to fill in the missing entries of a user-item- rating association matrix. Alternating Least Squares (ALS) matrix factorization is commonly used as a collaborative filtering algorithm. ALS models the rating matrix (R) as the multiplication of low-rank user (U) and product (V) factors and learns these factors by minimizing the reconstruction error of the observed ratings in an iterative procedure. Input data should contain 3 columns userCol(user ids), ItemCol (item ids), RatingCol (rating). <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"Rank\", \"Integer\", \"Rank\", \"10\"), Parameter(\"maxIterations\", \"Integer\", \"Max Iterations\", \"10\"), Parameter(\"regularizationParam\", \"Float\", \"regularizationParam\", \"0.0\")), \"20180314-114730_1399433199\")\n\n\n\nval e1 = Algorithm(\"kMeans\", \"k-means\", \"It is one of the most commonly used clustering algorithms that clusters the data points into a predefined number of clusters. The algorithm generates a model. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-clustering.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"numberOfClusters\", \"Integer\", \"Number of clusters\", \"2\"), Parameter(\"maxIterations\", \"Integer\", \"Maximum Iterations\", \"20\")), \"20180314-120138_1441676951\")\n\nval e2 = Algorithm(\"gaussianMixtures\", \"Gaussian Mixtures\", \"A Gaussian Mixture Model represents a composite distribution whereby points are drawn from one of k Gaussian sub-distributions, each with its own probability. This implementation uses the expectation-maximization algorithm to induce the maximum-likelihood model given a set of samples. The algorithm generates a model, with the predicted cluster center and probability of each cluster. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-clustering.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. 
output column, if any)\", \"-1\"), Parameter(\"numberOfClusters\", \"Integer\", \"Number of clusters\", \"2\"), Parameter(\"maxIterations\", \"Integer\", \"Maximum Iterations\", \"20\")), \"20180314-120159_2006536130\")\n\n\n\nval f1 = Algorithm(\"linearRegression\", \"Linear regression (OLS)\", \"Ordinary Least squares (OLS) is the simplest and most common linear regressor. The learning objective of OLS is to minimize the sum of squared residuals, in order to estimate the coefficients of the linear regression expression. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"maxIterations\", \"Integer\", \"Max Iterations\", \"10\"), Parameter(\"regularizationParam\", \"Float\", \"regularizationParam\", \"0.3\")), \"20180314-121848_2079236016\")\n\nval f2 = Algorithm(\"decisionTreesRegression\", \"Decision trees Regression (DTR)\", \"The decision tree is a greedy algorithm that performs a recursive binary partitioning of the feature space. Each partition is chosen greedily by selecting the best split from a set of possible splits, in order to maximize the information gain at a tree node. The impurity method used is the ‘variance’. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"maxDepth\", \"Integer\", \"Maximum Depth\", \"5\"), Parameter(\"maxBins\", \"Integer\", \"maxBins\", \"32\"), Parameter(\"minInstancesPerNode\", \"Integer\", \"minInstancesPerNode\", \"1\"), Parameter(\"minInfoGain\", \"Float\", \"minInfoGain\", \"0.0\")), \"20180314-121851_102488565\")\n\nval f3 = Algorithm(\"decisionTreesClassifier\", \"Decision trees Classifier (DTC)\", \"The decision tree is a greedy algorithm that performs a recursive binary partitioning of the feature space. Each partition is chosen greedily by selecting the best split from a set of possible splits, in order to maximize the information gain at a tree node. The impurity method used is the ‘entropy’. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"maxDepth\", \"Integer\", \"Maximum Depth\", \"5\"), Parameter(\"maxBins\", \"Integer\", \"maxBins\", \"32\"), Parameter(\"minInstancesPerNode\", \"Integer\", \"minInstancesPerNode\", \"1\"), Parameter(\"minInfoGain\", \"Float\", \"minInfoGain\", \"0.0\")), \"20180314-121852_1052604787\")\n\nval f4 = Algorithm(\"svm\", \"SVM\", \"A support vector machine (SVM) constructs a hyperplane or set of hyperplanes in a high- or infinite-dimensional space, which can be used for classification or regression. Outputs the model and predictionCol. 
<a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"maxIterations\", \"Integer\", \"Max Iterations\", \"100\"), Parameter(\"regularizationParam\", \"Float\", \"regularizationParam\", \"0.01\"), Parameter(\"convergenceTolerance\", \"Float\", \"convergenceTolerance\", \"0.001\")), \"20180314-121853_1193799050\")\n\nval f5 = Algorithm(\"mlp\", \"Multi-layer Perceptron (MLP)\", \"Multilayer perceptron (MLP) classifier is based on the feedforward artificial neural network. MLP classifier consists of multiple layers of nodes fully interconnected with each other. Nodes in the input layer represent the input data. All other nodes map inputs to outputs by a linear combination of the inputs with the node’s weights and bias. Each layer has sigmoid activation function, output layer has softmax. Number of inputs has to be equal to the size of feature vectors. Number of outputs has to be equal to the total number of labels. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"maxIterations\", \"Integer\", \"Max Iterations\", \"100\"), Parameter(\"step\", \"Float\", \"step\", \"0.1\"), Parameter(\"convergenceTolerance\", \"Float\", \"convergenceTolerance\", \"0.001\"), Parameter(\"layers\", \"Comma separated Integers\", \"example 33,72,25\"), Parameter(\"seed\", \"Integer\", \"seed\")), \"20180314-121854_1204567882\")\n\nval f6 = Algorithm(\"nb\", \"Naïve Bayes (NB)\", \"Naïve Bayes (NB) is a simple multiclass classification algorithm based on applying Bayes’ theorem with strong (naive) independence assumptions between the features. This version supports multinomial NB. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"smoothing\", \"Float\", \"smoothing\", \"1.0\")), \"20180314-121854_1598665997\")\n\nval dp1 =Parameter(\"poisson\", \"string\",\"\", \"Poisson\")\nval dp2 =Parameter(\"gaussian\", \"string\",\"\",\"Gaussian\")\nval dp3 =Parameter(\"binomial\", \"string\",\"\",\"Binomial\")\nval dp4 =Parameter(\"gamma\", \"string\",\"\",\"Gamma\")\nval dp5 =Parameter(\"tweedie\", \"string\",\"\",\"Tweedie\")\nval distrParamArray =Array(dp1,dp2,dp3,dp4,dp5)\n\nval f7 = Algorithm(\"glm\", \"Generalized Linear Models (GLM)\", \"Contrasted with linear regression where the output is assumed to follow a Gaussian distribution, generalized linear models (GLM) are specifications of linear models where the response variable follows some distribution from the exponential family of distributions. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. 
<a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"maxIterations\", \"Integer\", \"Max Iterations\", \"10\"), Parameter(\"regularizationParam\", \"Float\", \"regularizationParam\", \"0.3\"), Parameter(\"distFamily\", \"String\", \"Select Distribution Family (poisson, gaussian, binomial, gamma, tweedie)\", \"poisson\")), \"20180327-125233_1985102579\")\n\nval f8 = Algorithm(\"randomForestRegression\", \"Random Forest Regressor (RFR)\", \"Random forests construct a group of decision trees at training time and use the mean outcome as the product of the system. This method overcomes the overfitting issue of individual decision trees. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"maxDepth\", \"Integer\", \"Maximum Depth\", \"5\"), Parameter(\"maxBins\", \"Integer\", \"maxBins\", \"32\"), Parameter(\"minInstancesPerNode\", \"Integer\", \"minInstancesPerNode\", \"1\"), Parameter(\"minInfoGain\", \"Float\", \"minInfoGain\", \"0.0\"), Parameter(\"numTrees\", \"Integer\", \"number of Trees\", \"10\")), \"20180828-122052_380805630\")\n\nval f9 = Algorithm(\"gradientBoostedTreeRegression\", \"Gradient-boosted tree Regression (GBTR)\", \"Another popular ensemble of decision trees. It builds the model in a stage-wise fashion like other boosting methods do, and it generalizes them by allowing optimization of an arbitrary differentiable loss function. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"maxDepth\", \"Integer\", \"Maximum Depth\", \"5\"), Parameter(\"maxBins\", \"Integer\", \"maxBins\", \"32\"), Parameter(\"minInstancesPerNode\", \"Integer\", \"minInstancesPerNode\", \"1\"), Parameter(\"minInfoGain\", \"Float\", \"minInfoGain\", \"0.0\")), \"20180828-125333_2013070982\")\n\nval f10 = Algorithm(\"randomForestClassifier\", \"Random Forest Classifier (RFC)\", \"Random forests construct a group of decision trees at training time and use the mean outcome as the product of the system. This method overcomes the overfitting issue of individual decision trees. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. 
output column, if any)\", \"-1\"), Parameter(\"maxDepth\", \"Integer\", \"Maximum Depth\", \"5\"), Parameter(\"maxBins\", \"Integer\", \"maxBins\", \"32\"), Parameter(\"minInstancesPerNode\", \"Integer\", \"minInstancesPerNode\", \"1\"), Parameter(\"minInfoGain\", \"Float\", \"minInfoGain\", \"0.0\"), Parameter(\"numTrees\", \"Integer\", \"number of Trees\", \"10\")), \"20180828-131249_1439234059\")\n\nval f11 = Algorithm(\"gradientBoostedTreeClassifier\", \"Gradient-boosted tree Classifier (GBTC)\", \"Another popular ensemble of decision trees. It builds the model in a stage-wise fashion like other boosting methods do, and it generalizes them by allowing optimization of an arbitrary differentiable loss function (note: BINARY classification only available). Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\\\"_blank\\\" href=\\\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\\\">More…</a>\", Array(Parameter(\"labelCol\", \"Integer\", \"Label Column (Index of Label column, a.k.a. output column, if any)\", \"-1\"), Parameter(\"maxDepth\", \"Integer\", \"Maximum Depth\", \"5\"), Parameter(\"maxBins\", \"Integer\", \"maxBins\", \"32\"), Parameter(\"minInstancesPerNode\", \"Integer\", \"minInstancesPerNode\", \"1\"), Parameter(\"minInfoGain\", \"Float\", \"minInfoGain\", \"0.0\")), \"20180828-131253_989652806\")\n\nval g1 = Algorithm(\"trainValidationSplit\", \"Train-Validation Split\", \"train_validation_split_info\", Array(Parameter(\"ttr\", \"Float\", \"train-test ratio\")))\n\nval g2 = Algorithm(\"crossValidation\", \"Cross-Validation\", \"cross_validation_info\", Array(Parameter(\"k\", \"Integer\", \"Number of splits\")))\n\n// ParameterGroup(name: String, information: String, parameters: Array[Parameter])\n\n// MARK: - Algorithm Families Instantiation\n\nval algorithmArray1 = Array(a1, a2, a3, a4)\nval family1 = AlgorithmFamily(0, \"DATA TRANSFORMATION [INPUT COLs]\", algorithmArray1)\n\nval algorithmArray2 = Array(b1, b3)\nval family2 = AlgorithmFamily(0, \"DIMENSIONALITY REDUCTION/FEATURE EXTRACTION/SELECTION\", algorithmArray2)\n\nval algorithmArray3 = Array(c1, c2, c3)\nval family3 = AlgorithmFamily(1, \"NLP FUNCTIONS\", algorithmArray3)\n\nval algorithmArray4 = Array(d1)\nval family4 = AlgorithmFamily(2, \"RECOMMENDERS\", algorithmArray4)\n\nval algorithmArray5 = Array(e1, e2)\nval family5 = AlgorithmFamily(3, \"CLUSTERING\", algorithmArray5)\n\nval algorithmArray6 = Array(f1, f7, f2, f3, f8, f10, f9, f11, f5, f6)\nval family6 = AlgorithmFamily(4, \"CLASSIFICATION/REGRESSION\", algorithmArray6)\n\nval algorithmArray7 = Array(g1, g2)\nval family7 = AlgorithmFamily(5, \"EVALUATION METHODS\", algorithmArray7)\n\n// family1 (data transformation) is not included for now\n\nval algorithmFamilies = Array(family2, family3, family4, family5, family6) \nz.angularBind(\"algorithmFamilies\", algorithmFamilies)\n\nval evalMethod1 = EvaluationMethod(\"Split data by train-test ratio\", \"trainSplit\", Array(Parameter(\"train\", \"\", \"Train (%)\", \"50\"), Parameter(\"test\", \"\", \"Test (%)\", \"50\")))\nval evalMethod2 = EvaluationMethod(\"Split data by Cross Validation\", \"crossValidation\", Array(Parameter(\"xCrossValidation\", \"\", \"Set x for x-fold cross validation\", \"-1\")))\nval methodsArray = Array(evalMethod1, evalMethod2)\nval evaluationMethodFamily = EvaluationMethodFamily(methodsArray)\n\nz.angularBind(\"evaluationMethodFamily\", 
evaluationMethodFamily)\n\nz.angularBind(\"error\", false)\nz.angularBind(\"result\", false)","user":"suite5__PROJECTGENERICUSER","dateUpdated":"2018-08-28T15:35:27+0200","config":{"tableHide":true,"editorSetting":{"language":"scala","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"defined class Parameter\ndefined class ParameterGroup\ndefined class Algorithm\ndefined class AlgorithmFamily\ndefined class EvaluationMethod\ndefined class EvaluationMethodFamily\na1: Algorithm = Algorithm(stringIndexer,String Indexer,asdjiajdoiasjdojasd,[LParameter;@25f884b1,)\na2: Algorithm = Algorithm(oneHotEncoder,OneHotEncoder,qqqqqqqqqqqqqqqqqqqqqqqq,[LParameter;@70807bfb,)\na3: Algorithm = Algorithm(binarizer,Binarizer,eeeeeeeeeeeeeeeeeeeeee,[LParameter;@3f6190c5,)\na4: Algorithm = Algorithm(normalizer,Normalizer,rrrrrrrrrrrrrrrr,[LParameter;@23a7ad71,)\nb1: Algorithm = Algorithm(pca,PCA,Principal Component Analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a NEW set of values of linearly uncorrelated variables called principal components. Output is named pcaFeatures. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>,[LParameter;@c5c4539,20180314-100351_1767145110)\nb2: Algorithm = Algorithm(svd,SVD,zzzzzzzzzzzzzzzzzzzzzzzzzzz,[LParameter;@468b3a92,20180314-113307_1167994792)\nb3: Algorithm = Algorithm(chiSquared,ChiSquared,Chi-Squared feature selection operates on labeled data with categorical features. It uses the Chi-Squared test of independence to decide which features to choose. Output is named “chiFeatures”. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>,[LParameter;@40a9a414,20180314-113514_1190037522)\nc1: Algorithm = Algorithm(tokenizer,Tokenizer,Tokenization is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). Here we use RegexTokenizer that converts the input string to lowercase, removes stopwords and then splits it by white spaces. Output is named 'tokensOut'. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>,[LParameter;@6970ebfc,20180314-113929_1964205303)\nc2: Algorithm = Algorithm(nGram,n-gram,An n-gram is a sequence of n tokens (typically words) for some integer n. This function can be used to transform input features into n-grams, taking as input a sequence of strings (e.g. the output of the Tokenizer) and the output will consist of a sequence of n-grams where each n-gram is represented by a space-delimited string of n consecutive words. Output is named “ngramsOut”. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>,[LParameter;@1b2b6606,20180314-114545_1909431260)\nc3: Algorithm = Algorithm(tfIdf,TF-IDF,Term frequency-inverse document frequency (TF-IDF) is a feature vectorization method widely used in text mining to reflect the importance of a term to a document in a corpus. Output is named “tfidfOut”. 
<a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>,[LParameter;@1970bde2,20180314-114722_563562800)\nc4: Algorithm = Algorithm(word2vec,Word2Vec, <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>,[LParameter;@6e6be65c,)\nd1: Algorithm = Algorithm(als,Collaborative Filtering (ALS),Collaborative Filtering produces recommendations based on what similar users like and aims to fill in the missing entries of a user-item- rating association matrix. Alternating Least Squares (ALS) matrix factorization is commonly used as a collaborative filtering algorithm. ALS models the rating matrix (R) as the multiplication of low-rank user (U) and product (V) factors and learns these factors by minimizing the reconstruction error of the observed ratings in an iterative procedure. Input data should contain 3 columns userCol(user ids), ItemCol (item ids), RatingCol (rating). <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html\">More…</a>,[LParameter;@159230fd,20180314-114730_1399433199)\ne1: Algorithm = Algorithm(kMeans,k-means,It is one of the most commonly used clustering algorithms that clusters the data points into a predefined number of clusters. The algorithm generates a model. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-clustering.html\">More…</a>,[LParameter;@51ea182f,20180314-120138_1441676951)\ne2: Algorithm = Algorithm(gaussianMixtures,Gaussian Mixtures,A Gaussian Mixture Model represents a composite distribution whereby points are drawn from one of k Gaussian sub-distributions, each with its own probability. This implementation uses the expectation-maximization algorithm to induce the maximum-likelihood model given a set of samples. The algorithm generates a model, with the predicted cluster center and probability of each cluster. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-clustering.html\">More…</a>,[LParameter;@1b863a0e,20180314-120159_2006536130)\nf1: Algorithm = Algorithm(linearRegression,Linear regression (OLS),Ordinary Least squares (OLS) is the simplest and most common linear regressor. The learning objective of OLS is to minimize the sum of squared residuals, in order to estimate the coefficients of the linear regression expression. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@1c723855,20180314-121848_2079236016)\nf2: Algorithm = Algorithm(decisionTreesRegression,Decision trees Regression (DTR),The decision tree is a greedy algorithm that performs a recursive binary partitioning of the feature space. Each partition is chosen greedily by selecting the best split from a set of possible splits, in order to maximize the information gain at a tree node. The impurity method used is the ‘variance’. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@228c7931,20180314-121851_102488565)\nf3: Algorithm = Algorithm(decisionTreesClassifier,Decision trees Classifier (DTC),The decision tree is a greedy algorithm that performs a recursive binary partitioning of the feature space. 
Each partition is chosen greedily by selecting the best split from a set of possible splits, in order to maximize the information gain at a tree node. The impurity method used is the ‘entropy’. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@1d33e38a,20180314-121852_1052604787)\nf4: Algorithm = Algorithm(svm,SVM,A support vector machine (SVM) constructs a hyperplane or set of hyperplanes in a high- or infinite-dimensional space, which can be used for classification or regression. Outputs the model and predictionCol. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@5064dd37,20180314-121853_1193799050)\nf5: Algorithm = Algorithm(mlp,Multi-layer Perceptron (MLP),Multilayer perceptron (MLP) classifier is based on the feedforward artificial neural network. MLP classifier consists of multiple layers of nodes fully interconnected with each other. Nodes in the input layer represent the input data. All other nodes map inputs to outputs by a linear combination of the inputs with the node’s weights and bias. Each layer has sigmoid activation function, output layer has softmax. Number of inputs has to be equal to the size of feature vectors. Number of outputs has to be equal to the total number of labels. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-cl...f6: Algorithm = Algorithm(nb,Naïve Bayes (NB),Naïve Bayes (NB) is a simple multiclass classification algorithm based on applying Bayes’ theorem with strong (naive) independence assumptions between the features. This version supports multinomial NB. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@5a5f773b,20180314-121854_1598665997)\ndp1: Parameter = Parameter(poisson,string,,Poisson)\ndp2: Parameter = Parameter(gaussian,string,,Gaussian)\ndp3: Parameter = Parameter(binomial,string,,Binomial)\ndp4: Parameter = Parameter(gamma,string,,Gamma)\ndp5: Parameter = Parameter(tweedie,string,,Tweedie)\ndistrParamArray: Array[Parameter] = Array(Parameter(poisson,string,,Poisson), Parameter(gaussian,string,,Gaussian), Parameter(binomial,string,,Binomial), Parameter(gamma,string,,Gamma), Parameter(tweedie,string,,Tweedie))\nf7: Algorithm = Algorithm(glm,Generalized Linear Models (GLM),Contrasted with linear regression where the output is assumed to follow a Gaussian distribution, generalized linear models (GLM) are specifications of linear models where the response variable follows some distribution from the exponential family of distributions. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@1a246cf6,20180327-125233_1985102579)\nf8: Algorithm = Algorithm(randomForestRegression,Random Forest Regressor (RFR),Random forests construct a group of decision trees at training time and use the mean outcome as the product of the system. 
This method overcomes the overfitting issue of individual decision trees. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@adbebdb,20180828-122052_380805630)\nf9: Algorithm = Algorithm(gradientBoostedTreeRegression,Gradient-boosted tree Regression (GBTR),Another popular ensemble of decision trees. It builds the model in a stage-wise fashion like other boosting methods do, and it generalizes them by allowing optimization of an arbitrary differentiable loss function. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@91eac87,20180828-125333_2013070982)\nf10: Algorithm = Algorithm(randomForestClassifier,Random Forest Classifier (RFC),Random forests construct a group of decision trees at training time and use the mean outcome as the product of the system. This method overcomes the overfitting issue of individual decision trees. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@f7700f1,20180828-131249_1439234059)\nf11: Algorithm = Algorithm(gradientBoostedTreeClassifier,Gradient-boosted tree Classifier (GBTC),Another popular ensemble of decision trees. It builds the model in a stage-wise fashion like other boosting methods do, and it generalizes them by allowing optimization of an arbitrary differentiable loss function (note: BINARY classification only available). Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@6889017f,20180828-131253_989652806)\ng1: Algorithm = Algorithm(trainValidationSplit,Train-Validation Split,train_validation_split_info,[LParameter;@1b1659d4,)\ng2: Algorithm = Algorithm(crossValidation,Cross-Validation,cross_validation_info,[LParameter;@686b9739,)\nalgorithmArray1: Array[Algorithm] = Array(Algorithm(stringIndexer,String Indexer,asdjiajdoiasjdojasd,[LParameter;@25f884b1,), Algorithm(oneHotEncoder,OneHotEncoder,qqqqqqqqqqqqqqqqqqqqqqqq,[LParameter;@70807bfb,), Algorithm(binarizer,Binarizer,eeeeeeeeeeeeeeeeeeeeee,[LParameter;@3f6190c5,), Algorithm(normalizer,Normalizer,rrrrrrrrrrrrrrrr,[LParameter;@23a7ad71,))\nfamily1: AlgorithmFamily = AlgorithmFamily(0,DATA TRANSFORMATION [INPUT COLs],[LAlgorithm;@13899f10)\nalgorithmArray2: Array[Algorithm] = Array(Algorithm(pca,PCA,Principal Component Analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a NEW set of values of linearly uncorrelated variables called principal components. Output is named pcaFeatures. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>,[LParameter;@c5c4539,20180314-100351_1767145110), Algorithm(chiSquared,ChiSquared,Chi-Squared feature selection operates on labeled data with categorical features. It uses the Chi-Squared test of independence to decide which features to choose. 
Output is named “chiFeatures”. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a...family2: AlgorithmFamily = AlgorithmFamily(0,DIMENSIONALITY REDUCTION/FEATURE EXTRACTION/SELECTION,[LAlgorithm;@3183db9e)\nalgorithmArray3: Array[Algorithm] = Array(Algorithm(tokenizer,Tokenizer,Tokenization is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). Here we use RegexTokenizer that converts the input string to lowercase, removes stopwords and then splits it by white spaces. Output is named 'tokensOut'. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>,[LParameter;@6970ebfc,20180314-113929_1964205303), Algorithm(nGram,n-gram,An n-gram is a sequence of n tokens (typically words) for some integer n. This function can be used to transform input features into n-grams, taking as input a sequence of strings (e.g. the output of the Tokenizer) and the output will consist of a sequence of n-grams where each n-gram is...family3: AlgorithmFamily = AlgorithmFamily(1,NLP FUNCTIONS,[LAlgorithm;@6d07badb)\nalgorithmArray4: Array[Algorithm] = Array(Algorithm(als,Collaborative Filtering (ALS),Collaborative Filtering produces recommendations based on what similar users like and aims to fill in the missing entries of a user-item- rating association matrix. Alternating Least Squares (ALS) matrix factorization is commonly used as a collaborative filtering algorithm. ALS models the rating matrix (R) as the multiplication of low-rank user (U) and product (V) factors and learns these factors by minimizing the reconstruction error of the observed ratings in an iterative procedure. Input data should contain 3 columns userCol(user ids), ItemCol (item ids), RatingCol (rating). <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html\">More…</a>,[LParameter;@159230fd,...family4: AlgorithmFamily = AlgorithmFamily(2,RECOMMENDERS,[LAlgorithm;@7fa15a97)\nalgorithmArray5: Array[Algorithm] = Array(Algorithm(kMeans,k-means,It is one of the most commonly used clustering algorithms that clusters the data points into a predefined number of clusters. The algorithm generates a model. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-clustering.html\">More…</a>,[LParameter;@51ea182f,20180314-120138_1441676951), Algorithm(gaussianMixtures,Gaussian Mixtures,A Gaussian Mixture Model represents a composite distribution whereby points are drawn from one of k Gaussian sub-distributions, each with its own probability. This implementation uses the expectation-maximization algorithm to induce the maximum-likelihood model given a set of samples. The algorithm generates a model, with the predicted cluster center and probability of each cluster...family5: AlgorithmFamily = AlgorithmFamily(3,CLUSTERING,[LAlgorithm;@59fcbfc8)\nalgorithmArray6: Array[Algorithm] = Array(Algorithm(linearRegression,Linear regression (OLS),Ordinary Least squares (OLS) is the simplest and most common linear regressor. The learning objective of OLS is to minimize the sum of squared residuals, in order to estimate the coefficients of the linear regression expression. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. 
<a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>,[LParameter;@1c723855,20180314-121848_2079236016), Algorithm(glm,Generalized Linear Models (GLM),Contrasted with linear regression where the output is assumed to follow a Gaussian distribution, generalized linear models (GLM) are specificatio...family6: AlgorithmFamily = AlgorithmFamily(4,CLASSIFICATION/REGRESSION,[LAlgorithm;@7abeec9b)\nalgorithmArray7: Array[Algorithm] = Array(Algorithm(trainValidationSplit,Train-Validation Split,train_validation_split_info,[LParameter;@1b1659d4,), Algorithm(crossValidation,Cross-Validation,cross_validation_info,[LParameter;@686b9739,))\nfamily7: AlgorithmFamily = AlgorithmFamily(5,EVALUATION METHODS,[LAlgorithm;@4b592056)\nalgorithmFamilies: Array[AlgorithmFamily] = Array(AlgorithmFamily(0,DIMENSIONALITY REDUCTION/FEATURE EXTRACTION/SELECTION,[LAlgorithm;@3183db9e), AlgorithmFamily(1,NLP FUNCTIONS,[LAlgorithm;@6d07badb), AlgorithmFamily(2,RECOMMENDERS,[LAlgorithm;@7fa15a97), AlgorithmFamily(3,CLUSTERING,[LAlgorithm;@59fcbfc8), AlgorithmFamily(4,CLASSIFICATION/REGRESSION,[LAlgorithm;@7abeec9b))\nevalMethod1: EvaluationMethod = EvaluationMethod(Split data by train-test ratio,trainSplit,[LParameter;@5a889b28)\nevalMethod2: EvaluationMethod = EvaluationMethod(Split data by Cross Validation,crossValidation,[LParameter;@202e784e)\nmethodsArray: Array[EvaluationMethod] = Array(EvaluationMethod(Split data by train-test ratio,trainSplit,[LParameter;@5a889b28), EvaluationMethod(Split data by Cross Validation,crossValidation,[LParameter;@202e784e))\nevaluationMethodFamily: EvaluationMethodFamily = EvaluationMethodFamily([LEvaluationMethod;@4f3519d2)\n"}]},"apps":[],"jobName":"paragraph_1532679401524_-449295701","id":"20180215-104235_1720176169","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-28T15:18:25+0200","dateFinished":"2018-08-28T15:18:36+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:211"},{"title":"Load Iris Data & txt data","text":"// var path = \"hdfs:///Projects/demo_spark_minas001/ML/iris_data.txt\"\n// var df = spark.read\n// .option(\"sep\", \",\")\n// .option(\"inferSchema\", \"true\")\n// .option(\"header\", \"false\")\n// .csv(path)\n\n// var path2 = \"hdfs:///Projects/demo_spark_minas001/ML/twitter_raw_text_sample.csv\"\n// var df_txt = spark.read\n// .option(\"sep\", \",\")\n// .option(\"inferSchema\", \"true\")\n// .option(\"header\", \"false\")\n// .csv(path2) \n \n// var path3 = \"hdfs:///Projects/demo_spark_minas001/ML/new_user_rated_music.dat\"\n// var df_rating = spark.read\n// .option(\"sep\", \"\\t\")\n// .option(\"inferSchema\", \"true\")\n// .option(\"header\", \"false\")\n// .csv(path3) ","dateUpdated":"2018-08-27T11:31:21+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","fontSize":9,"editorHide":true,"title":true,"results":{"1":{"graph":{"mode":"multiBarChart","height":300,"optionOpen":false,"setting":{"table":{"tableGridState":{},"tableColumnTypeState":{"names":{"_c0":"string","_c1":"string","_c2":"string","_c3":"string","_c4":"string"},"updated":false},"tableOptionSpecHash":"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better 
navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]","tableOptionValue":{"useFilter":false,"showPagination":false,"showAggregationFooter":false},"updated":false,"initialized":false},"multiBarChart":{"rotate":{"degree":"-45"},"xLabelStatus":"default"}},"commonSetting":{},"keys":[{"name":"_c0","index":0,"aggr":"sum"}],"groups":[],"values":[{"name":"_c1","index":1,"aggr":"sum"}]},"helium":{}}},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1532679401524_-449295701","id":"20180327-092105_583525245","dateCreated":"2018-07-27T10:16:41+0200","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:212"},{"title":"Redirect Paragraph Execution","text":"%pyspark\n# import pandas as pd\n# import os\n# print(os.environ['HDFS'])\n# # Getting the data and creating the DataFrame\n# path = \"hdfs:///Projects/demo_spark_minas001/ML/iris_data.txt\"\n# pandas_df = pd.read_csv(path=path, header=None)\n# spark_df=sqlContext.createDataFrame(pandas_df)\n\n# #Set dataset and number of columns as global params\n# z.put(\"dataset\", spark_df._jdf)\n# z.put(\"numColumns\", len(pdf.columns))\n\nz.z.angularBind(\"error\", False)\nz.z.angularBind(\"result\", False)\nz.z.angularBind(\"arrayObjects\", None)\nz.z.angularBind(\"isDatasetResultPopulated\", False)\nz.put(\"displayDataframe\", False)\n\ndatasetPath = z.z.angular(\"datasetPath\")\ndatasetSeparator = z.z.angular(\"datasetSeparator\")\ndatasetOutputPath = z.z.angular(\"datasetOutputPath\")\nz.put(\"datasetPath\", datasetPath)\nz.put(\"datasetSeparator\", datasetSeparator)\nz.put(\"datasetOutputPath\", datasetOutputPath)\n\n#Select Algorithm params\nalgorithm = z.z.angular(\"selectedAlgorithm\")\nfor param in algorithm['parameters']:\n z.z.put(param['name'], param['value'])\n\nselectedEvaluationMethod = z.z.angular(\"selectedEvalMeth\")\nif selectedEvaluationMethod != None:\n z.z.put(\"evalMethod\", selectedEvaluationMethod['value'])\n for param in selectedEvaluationMethod['parameters']:\n z.z.put(param['name'], param['value'])\n\nz.z.run(algorithm['paragraphId'])\n","user":"suite5__PROJECTGENERICUSER","dateUpdated":"2018-08-28T17:37:01+0200","config":{"tableHide":false,"editorSetting":{"language":"python","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/python","editorHide":true,"fontSize":9,"title":true,"results":{"0":{"graph":{"mode":"table","height":300,"optionOpen":false,"setting":{"table":{"tableGridState":{},"tableColumnTypeState":{"names":{"pcaFeatures":"string"},"updated":false},"tableOptionSpecHash":"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated 
values\"}]","tableOptionValue":{"useFilter":false,"showPagination":false,"showAggregationFooter":false},"updated":false,"initialized":false}},"commonSetting":{}}}},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1532679401524_-449295701","id":"20180221-124127_25791550","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-28T17:37:02+0200","dateFinished":"2018-08-28T17:37:03+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:213"},{"title":"PCA","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.feature.PCA\nimport org.apache.spark.ml.linalg.Vectors\nimport org.apache.spark.ml.feature.VectorAssembler\n\nvar labelCol: Integer = _\nvar numTopFeatures: Integer = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n numTopFeatures = z.get(\"numTopFeatures\").asInstanceOf[String].toInt\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Assign df with features\nvar df_features = df\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\ntry\n{\n val assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n // Compute the top principal components.\n val pca = new PCA()\n .setInputCol(\"features\")\n .setOutputCol(\"pcaFeatures\")\n .setK(numTopFeatures)\n .fit(assembler1)\n \n var datasetResult = pca.transform(assembler1).select(\"pcaFeatures\")\n \n val now = Calendar.getInstance().getTime()\n val intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\n val dateString = intFormat.format(now)\n\n try {\n datasetResult.rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"pca_dataset_\" + dateString + \".csv\")\n pca.save(datasetOutputPath + \"pca_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n\n // pca.write.option(\"header\", \"true\").csv(\"hdfs:///Projects/demo_spark_minas001/ML/pca_saved_model_01.csv\")\n // pca.write.format(\"com.databricks.spark.csv\").save(\"hdfs:///Projects/demo_spark_minas001/ML/pca_saved_dataset1.csv\")\n \n // Prepare result for display\n z.put(\"datasetResult\", datasetResult)\n z.put(\"displayDataframe\", true)\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"PCA ran successfully. 
You can find the resulted features in the table below.\")\n z.angularBind(\"result2\", \"\")\n z.angularBind(\"arrayObjects\", Array())\n z.run(\"20180327-125756_1304518740\")\n\n}catch{\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n} ","user":"anonymous","dateUpdated":"2018-08-27T17:40:04+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{"1":{"graph":{"mode":"multiBarChart","height":300,"optionOpen":false,"setting":{"table":{"tableGridState":{},"tableColumnTypeState":{"names":{"pcaFeatures":"string"},"updated":false},"tableOptionSpecHash":"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]","tableOptionValue":{"useFilter":false,"showPagination":false,"showAggregationFooter":false},"updated":false,"initialized":false},"multiBarChart":{"rotate":{"degree":"-45"},"xLabelStatus":"default"}},"commonSetting":{},"keys":[],"groups":[],"values":[]},"helium":{}}},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.feature.PCA\nimport org.apache.spark.ml.linalg.Vectors\nimport org.apache.spark.ml.feature.VectorAssembler\nlabelCol: Integer = null\nnumTopFeatures: Integer = null\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\nlabelColReal: Int = -2\nlabelCol_str: String = _c-2\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 
3 more fields]\n"}]},"apps":[],"jobName":"paragraph_1532679401525_-449680450","id":"20180314-100351_1767145110","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-27T12:28:16+0200","dateFinished":"2018-08-27T12:28:21+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:214"},{"title":"SVD","text":"import org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.ml.feature.Normalizer\nimport org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\n\nval numTopFeatures = z.get(\"numTopFeatures\")\n\n// var path = \"hdfs:///Projects/demo_spark_minas001/ML/iris_data.txt\"\n// var df = spark.read\n// .option(\"sep\", \",\")\n// .option(\"inferSchema\", \"true\")\n// .option(\"header\", \"false\")\n// .csv(path)\ndf.printSchema\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df.columns)\n .setOutputCol(\"features\")\n .transform(df)\n \n//assembler1.show(10)\n\nval normalizer = new Normalizer()\n .setInputCol(\"features\")\n .setOutputCol(\"normFeatures\")\n .setP(2.0)\n .transform(assembler1)\n \nnormalizer.show(10) \n\n// Split the data into training and test sets (30% held out for testing).\nval Array(trainingData, testData) = normalizer.randomSplit(Array(0.7, 0.3))\n\n// instantiate the base classifier\nval classifier = new LogisticRegression()\n .setMaxIter(10)\n .setTol(1E-6)\n .setFitIntercept(true)\n\n \n// instantiate the One Vs Rest Classifier.\nval ovr = new OneVsRest()\n .setLabelCol(\"_c4\")\n .setFeaturesCol(\"normFeatures\")\n .setClassifier(classifier) \n\n\n// train the multiclass model.\nval ovrModel = ovr.fit(trainingData)\n\n// score the model on test data.\nval predictions = ovrModel.transform(testData)\n\n// obtain evaluator.\nval evaluator = new MulticlassClassificationEvaluator()\n .setLabelCol(\"_c4\")\n .setMetricName(\"accuracy\")\n\n// compute the classification error on test data.\nval accuracy = evaluator.evaluate(predictions)\nprintln(s\"Test Error = ${1 - accuracy}\")","dateUpdated":"2018-08-28T12:59:13+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":false,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.ml.feature.Normalizer\nimport org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nnumTopFeatures: Object = 3\nroot\n |-- _c0: double (nullable = true)\n |-- _c1: double (nullable = true)\n |-- _c2: double (nullable = true)\n |-- _c3: double (nullable = true)\n |-- _c4: integer (nullable = true)\n\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 4 more fields]\nnormalizer: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 
5 more fields]\n+---+---+---+---+---+--------------------+--------------------+\n|_c0|_c1|_c2|_c3|_c4| features| normFeatures|\n+---+---+---+---+---+--------------------+--------------------+\n|5.1|3.5|1.4|0.2| 0|[5.1,3.5,1.4,0.2,...|[0.80377277301538...|\n|4.9|3.0|1.4|0.2| 0|[4.9,3.0,1.4,0.2,...|[0.82813287338687...|\n|4.7|3.2|1.3|0.2| 0|[4.7,3.2,1.3,0.2,...|[0.80533307538050...|\n|4.6|3.1|1.5|0.2| 0|[4.6,3.1,1.5,0.2,...|[0.80003024746205...|\n|5.0|3.6|1.4|0.2| 0|[5.0,3.6,1.4,0.2,...|[0.79096499646041...|\n|5.4|3.9|1.7|0.4| 0|[5.4,3.9,1.7,0.4,...|[0.78417498628181...|\n|4.6|3.4|1.4|0.3| 0|[4.6,3.4,1.4,0.3,...|[0.78010935569635...|\n|5.0|3.4|1.5|0.2| 0|[5.0,3.4,1.5,0.2,...|[0.80218491851981...|\n|4.4|2.9|1.4|0.2| 0|[4.4,2.9,1.4,0.2,...|[0.80642365615178...|\n|4.9|3.1|1.5|0.1| 0|[4.9,3.1,1.5,0.1,...|[0.81803119003536...|\n+---+---+---+---+---+--------------------+--------------------+\nonly showing top 10 rows\n\ntrainingData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [_c0: double, _c1: double ... 5 more fields]\ntestData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [_c0: double, _c1: double ... 5 more fields]\nclassifier: org.apache.spark.ml.classification.LogisticRegression = logreg_36f259386022\novr: org.apache.spark.ml.classification.OneVsRest = oneVsRest_71199ad39d5f\novrModel: org.apache.spark.ml.classification.OneVsRestModel = oneVsRest_71199ad39d5f\npredictions: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 6 more fields]\nevaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_0a42b758bef2\naccuracy: Double = 1.0\nTest Error = 0.0\n"}]},"apps":[],"jobName":"paragraph_1532679401525_-449680450","id":"20180314-113307_1167994792","dateCreated":"2018-07-27T10:16:41+0200","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:215"},{"title":"ChiSquared","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.feature.ChiSqSelector\nimport org.apache.spark.ml.linalg.Vectors\nimport org.apache.spark.ml.feature.VectorAssembler\n\nvar labelCol: Integer = _\nvar numTopFeatures: Integer = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n numTopFeatures = z.get(\"numTopFeatures\").asInstanceOf[String].toInt\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n\n// Assign df with features\nvar df_features = df\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n//else{labelCol_str = \"_c4\"}\n\ntry\n{\n val assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n \n val selector = new ChiSqSelector()\n .setNumTopFeatures(numTopFeatures)\n .setFeaturesCol(\"features\")\n .setLabelCol(labelCol_str)\n .setOutputCol(\"selectedFeatures\")\n \n val result = 
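\n// ChiSqSelector scores each feature against the label column with a chi-squared independence test and keeps the numTopFeatures highest-scoring ones: fit() estimates the statistics, transform() appends the selectedFeatures column.\n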
selector.fit(assembler1).transform(assembler1)\n var datasetResult = result.select(\"selectedFeatures\")\n \n val now = Calendar.getInstance().getTime()\n val intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\n val dateString = intFormat.format(now)\n \n try {\n datasetResult.rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"chiSquared_dataset_\" + dateString + \".csv\")\n selector.save(datasetOutputPath + \"chiSquared_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n // Prepare result for display\n z.put(\"datasetResult\", datasetResult)\n z.put(\"displayDataframe\", true)\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"ChiSquared ran successfully. You can find the resulted features in the table below.\")\n z.angularBind(\"result2\", \"\")\n z.angularBind(\"arrayObjects\", Array())\n z.run(\"20180327-125756_1304518740\")\n}catch{\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}","dateUpdated":"2018-08-27T12:06:37+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.feature.ChiSqSelector\nimport org.apache.spark.ml.linalg.Vectors\nimport org.apache.spark.ml.feature.VectorAssembler\nlabelCol: Integer = null\nnumTopFeatures: Integer = null\ndatasetPath: String = null\ndatasetSeparator: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 
3 more fields]\n"}]},"apps":[],"jobName":"paragraph_1532679401525_-449680450","id":"20180314-113514_1190037522","dateCreated":"2018-07-27T10:16:41+0200","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:216"},{"title":"Tokenizer","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}\nimport org.apache.spark.sql.functions._\n\n\n//Label col will act as sentence col\nvar labelCol: Integer = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n// Get column with text\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// val sentenceDataFrame = spark.createDataFrame(Seq(\n// (0, \"Hi I heard about Spark\"),\n// (1, \"I wish Java could use case classes\"),\n// (2, \"Logistic,regression,models,are,neat\")\n// )).toDF(\"id\", \"sentence\")\n\n// val assembler1 = new VectorAssembler()\n// .setInputCols(Array(labelCol_str))\n// .setOutputCol(\"sentence\")\n// .transform(df)\n\n\n// Read Dataset from path\nvar df_txt = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n\n// toDF(\"id\",\"sentence\") fails whenever the file does not have exactly two columns (see the recorded error below); build the frame from the configured text column instead\nval sentenceDataFrame = df_txt.select(monotonically_increasing_id().as(\"id\"), col(labelCol_str).cast(\"string\").as(\"sentence\"))\nval regexTokenizer = new RegexTokenizer()\n .setInputCol(\"sentence\")\n .setOutputCol(\"words\")\n .setPattern(\"\\\\W\")\n .transform(sentenceDataFrame)// alternatively .setPattern(\"\\\\w+\").setGaps(false)\n \ntry{\n val countTokens = udf { (words: Seq[String]) => words.length } \n var datasetResult = regexTokenizer.select(\"sentence\", \"words\")\n .withColumn(\"tokens\", countTokens(col(\"words\")))\n \n val now = Calendar.getInstance().getTime()\n val intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\n val dateString = intFormat.format(now)\n \n try {\n datasetResult.rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"tokenizer_dataset_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n\n // Prepare result for display\n z.put(\"datasetResult\", datasetResult)\n z.put(\"displayDataframe\", true)\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Tokenizer ran successfully. 
You can view and export the resulted tokens in the table below.\")\n z.angularBind(\"result2\", \"\")\n \n z.run(\"20180327-125756_1304518740\") \n\n}catch{\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}","dateUpdated":"2018-08-27T17:40:06+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{"1":{"graph":{"mode":"table","height":300,"optionOpen":false,"setting":{"table":{"tableGridState":{},"tableColumnTypeState":{"names":{"sentence":"string","words":"string","tokens":"string"},"updated":false},"tableOptionSpecHash":"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]","tableOptionValue":{"useFilter":false,"showPagination":false,"showAggregationFooter":false},"updated":false,"initialized":false}},"commonSetting":{}}}},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}\nimport org.apache.spark.sql.functions._\nlabelCol: Integer = null\ndatasetPath: String = null\ndatasetSeparator: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\ndf_txt: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\njava.lang.IllegalArgumentException: requirement failed: The number of columns doesn't match.\nOld column names (5): _c0, _c1, _c2, _c3, _c4\nNew column names (2): id, sentence\n at scala.Predef$.require(Predef.scala:224)\n at org.apache.spark.sql.Dataset.toDF(Dataset.scala:397)\n ... 
48 elided\n"}]},"apps":[],"jobName":"paragraph_1532679401525_-449680450","id":"20180314-113929_1964205303","dateCreated":"2018-07-27T10:16:41+0200","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:217"},{"title":"N-gram","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.feature.NGram\nimport org.apache.spark.sql.functions.{col, split}\n\n\nvar labelCol: Integer = _\nvar numOfTerms: Integer = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n numOfTerms = z.get(\"numOfTerms\").asInstanceOf[String].toInt\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Read Dataset from path\nvar df_txt = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n\n// Sample data, kept for reference only; it must not overwrite the dataset read above\n// val df_txt = spark.createDataFrame(Seq(\n// (0, Array(\"Hi\", \"I\", \"heard\", \"about\", \"Spark\")),\n// (1, Array(\"I\", \"wish\", \"Java\", \"could\", \"use\", \"case\", \"classes\")),\n// (2, Array(\"Logistic\", \"regression\", \"models\", \"are\", \"neat\"))\n// )).toDF(\"id\", \"_c1\")\n\ntry {\n // Get column with labels\n var labelColReal=labelCol-1\n var labelCol_str = \"_c\"+labelColReal.toString\n \n // NGram expects an array-of-tokens column (e.g. Tokenizer output); split the configured text column on whitespace first\n val df_tokens = df_txt.withColumn(labelCol_str, split(col(labelCol_str), \"\\\\s+\"))\n \n val ngram = new NGram()\n .setN(numOfTerms)\n .setInputCol(labelCol_str)\n .setOutputCol(\"ngrams\")\n \n val ngramDataFrame = ngram.transform(df_tokens)\n ngramDataFrame.select(\"ngrams\").show(false)\n \n val now = Calendar.getInstance().getTime()\n val intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\n val dateString = intFormat.format(now)\n \n try {\n ngramDataFrame.rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"ngram_dataset_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n","dateUpdated":"2018-08-27T17:40:08+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.feature.NGram\nlabelCol: Integer = null\nnumOfTerms: Integer = null\ndatasetPath: String = null\ndatasetSeparator: String = null\njava.lang.IllegalArgumentException: Can not create a Path from a null string\n at org.apache.hadoop.fs.Path.checkPathArg(Path.java:159)\n at org.apache.hadoop.fs.Path.<init>(Path.java:175)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:349)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:348)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at 
scala.collection.immutable.List.foreach(List.scala:381)\n at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n at scala.collection.immutable.List.flatMap(List.scala:344)\n at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:348)\n at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412)\n ... 48 elided\n"}]},"apps":[],"jobName":"paragraph_1532679401525_-449680450","id":"20180314-114545_1909431260","dateCreated":"2018-07-27T10:16:41+0200","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:218"},{"title":"TF-IDF","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}\nimport org.apache.spark.sql.functions.{col, monotonically_increasing_id}\n\nvar labelCol: Integer = _\nvar numTopFeatures: Integer = _\nvar minDocFrequency: Integer = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n numTopFeatures = z.get(\"numTopFeatures\").asInstanceOf[String].toInt\n minDocFrequency = z.get(\"minDocFrequency\").asInstanceOf[String].toInt\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n// Get column with text\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df_txt = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// toDF(\"label\",\"sentence\") requires exactly two input columns; use the configured text column and a generated row id as the label instead\nval sentenceDataFrame = df_txt.select(monotonically_increasing_id().as(\"label\"), col(labelCol_str).cast(\"string\").as(\"sentence\"))\n// val sentenceDataFrame = spark.createDataFrame(Seq(\n// (0.0, \"Hi I heard about Spark\"),\n// (0.0, \"I wish Java could use case classes\"),\n// (1.0, \"Logistic regression models are neat\")\n// )).toDF(\"label\", \"sentence\")\nval tokenizer = new Tokenizer().setInputCol(\"sentence\").setOutputCol(\"words\")\n\ntry{\n val wordsData = tokenizer.transform(sentenceDataFrame)\n \n val hashingTF = new HashingTF()\n .setInputCol(\"words\").setOutputCol(\"rawFeatures\").setNumFeatures(numTopFeatures)\n \n val featurizedData = hashingTF.transform(wordsData)\n // alternatively, CountVectorizer can also be used to get term frequency vectors\n \n val idf = new IDF()\n .setInputCol(\"rawFeatures\")\n .setOutputCol(\"featureVectors\")\n .setMinDocFreq(minDocFrequency)\n \n val idfModel = idf.fit(featurizedData)\n \n val rescaledData = idfModel.transform(featurizedData)\n var datasetResult = rescaledData.limit(20).select(\"label\", \"rawFeatures\", \"featureVectors\", \"sentence\")\n \n val now = Calendar.getInstance().getTime()\n val intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\n val dateString = intFormat.format(now)\n \n try {\n datasetResult.rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"tfIdf_dataset_\" + dateString + \".csv\")\n idfModel.save(datasetOutputPath + \"tfIdf_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't 
have permission\")\n }\n }\n \n //var datasetResult = featurizedData.limit(20).select(\"label\", \"rawFeatures\")\n // Prepare result for display\n z.put(\"datasetResult\", datasetResult)\n z.put(\"displayDataframe\", true)\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"TF_IDF ran successfully. You can view the results in the table below.\")\n z.angularBind(\"result2\", \"\")\n //z.angularBind(\"arrayObjects\", rescaledData)\n z.run(\"20180327-125756_1304518740\") \n\n}catch{\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}","dateUpdated":"2018-08-27T12:11:50+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}\nlabelCol: Integer = null\nnumTopFeatures: Integer = null\nminDocFrequency: Integer = null\ndatasetPath: String = null\ndatasetSeparator: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\njava.lang.IllegalArgumentException: Can not create a Path from a null string\n at org.apache.hadoop.fs.Path.checkPathArg(Path.java:159)\n at org.apache.hadoop.fs.Path.<init>(Path.java:175)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:349)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:348)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.immutable.List.foreach(List.scala:381)\n at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n at scala.collection.immutable.List.flatMap(List.scala:344)\n at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:348)\n at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412)\n ... 
48 elided\n"}]},"apps":[],"jobName":"paragraph_1532679401526_-448526203","id":"20180314-114722_563562800","dateCreated":"2018-07-27T10:16:41+0200","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:219"},{"title":"Collaborative Filtering (ALS)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.recommendation.ALS\nimport org.apache.spark.sql.functions.col\n\nvar labelCol: Integer = _\nvar rank: Integer = _\nvar maxIterations: Integer = _\nvar regularizationParam: Double = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n rank = z.get(\"Rank\").asInstanceOf[String].toInt\n maxIterations = z.get(\"maxIterations\").asInstanceOf[String].toInt\n regularizationParam = z.get(\"regularizationParam\").asInstanceOf[String].toDouble\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\nval lookup = Map(\"_c0\" -> \"userId\", \"_c1\" -> \"itemId\", \"_c2\" -> \"rating\")\n\n// Read Dataset from path\nvar df_rating = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Use df_rating as initial dataset (dataframe loaded) and assign column titles\nval ratings = df_rating.select(df_rating.columns.map(c => col(c).as(lookup.getOrElse(c, c))): _*)\n\n \nval Array(training, testing) = ratings.randomSplit(Array(0.8, 0.2)) // NOTE: fixed 80/20 train/test split\n\n// Build the recommendation model using ALS on the training data\nval als = new ALS()\n .setRank(rank) // apply the configured Rank (it was read above but never used)\n .setMaxIter(maxIterations)\n .setRegParam(regularizationParam)\n //.setImplicitPrefs(true)\n .setUserCol(\"userId\")\n .setItemCol(\"itemId\")\n .setRatingCol(\"rating\")\n\n\n\n\ntry {\n // Fit the model\n val model = als.fit(training)\n\n // Evaluate the model by computing the RMSE on the test data\n // Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics\n model.setColdStartStrategy(\"drop\")\n val predictions = model.transform(testing)\n \n val evaluator = new RegressionEvaluator()\n .setMetricName(\"rmse\")\n .setLabelCol(\"rating\")\n .setPredictionCol(\"prediction\")\n val rmse = evaluator.evaluate(predictions)\n println(s\"Root-mean-square error = $rmse\")\n \n val now = Calendar.getInstance().getTime()\n val intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\n val dateString = intFormat.format(now)\n \n try {\n model.save(datasetOutputPath + \"als_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n\n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"\")\n z.angularBind(\"result2\", \"Root-mean-square error (RMSE) = \" + rmse)\n \n z.put(\"displayDataframe\", false)\n z.run(\"20180327-125756_1304518740\")\n \n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n// Save and load model\n//model.save(sc, \"target/tmp/myCollaborativeFilter\")\n//val sameModel = 
MatrixFactorizationModel.load(sc, \"target/tmp/myCollaborativeFilter\")","dateUpdated":"2018-08-27T12:12:05+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.recommendation.ALS\nlabelCol: Integer = null\nrank: Integer = null\nmaxIterations: Integer = null\nregularizationParam: Double = 0.0\ndatasetPath: String = null\ndatasetSeparator: String = null\nlookup: scala.collection.immutable.Map[String,String] = Map(_c0 -> userId, _c1 -> itemId, _c2 -> rating)\njava.lang.IllegalArgumentException: Can not create a Path from a null string\n at org.apache.hadoop.fs.Path.checkPathArg(Path.java:159)\n at org.apache.hadoop.fs.Path.<init>(Path.java:175)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:349)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:348)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.immutable.List.foreach(List.scala:381)\n at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n at scala.collection.immutable.List.flatMap(List.scala:344)\n at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:348)\n at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412)\n ... 
48 elided\n"}]},"apps":[],"jobName":"paragraph_1532679401526_-448526203","id":"20180314-114730_1399433199","dateCreated":"2018-07-27T10:16:41+0200","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:220"},{"title":"K-means","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.clustering.KMeans\nimport org.apache.spark.ml.feature.VectorAssembler\n\nvar labelCol: Integer = _\nvar numberOfClusters: Integer = _\nvar maxIterations: Integer = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n numberOfClusters = z.get(\"numberOfClusters\").asInstanceOf[String].toInt\n maxIterations = z.get(\"maxIterations\").asInstanceOf[String].toInt\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n \n// Train a k-means model.\nval kmeans = new KMeans()\n .setFeaturesCol(\"features\")\n .setK(numberOfClusters)\n .setMaxIter(maxIterations)\n .setSeed(1)\n\ntry{\n val model = kmeans.fit(assembler1)\n\n // Evaluate clustering by computing Within Set Sum of Squared Errors.\n val WSSSE = model.computeCost(assembler1)\n println(s\"Within Set Sum of Squared Errors = $WSSSE\")\n\n // Shows the result.\n //println(\"Cluster Centers: \")\n val centers = model.clusterCenters.toArray\n //centers.foreach(println)\n\n var datasetResult = model.transform(assembler1).select(\"features\",\"prediction\")\n \n val now = Calendar.getInstance().getTime()\n val intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\n val dateString = intFormat.format(now)\n \n try {\n datasetResult.rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"kmeans_dataset_\" + dateString + \".csv\")\n model.save(datasetOutputPath + \"kmeans_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n // Prepare result for display\n z.put(\"datasetResult\", datasetResult)\n z.put(\"displayDataframe\", true)\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Within Set Sum of Squared Errors (WSSSE) = \"+ WSSSE)\n z.angularBind(\"result2\", \"Cluster centers:\")\n z.angularBind(\"arrayObjects\", centers)\n z.run(\"20180327-125756_1304518740\") \n\n}catch{\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n 
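// A label-free alternative fit metric, assuming Spark 2.3+ is available (computeCost above reports WSSSE on the training data itself):\n // import org.apache.spark.ml.evaluation.ClusteringEvaluator\n // val silhouette = new ClusteringEvaluator().evaluate(model.transform(assembler1))\n 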
}\n}","dateUpdated":"2018-08-27T12:12:20+0200","config":{"tableHide":true,"editorSetting":{"language":"scala","editOnDblClick":false},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.clustering.KMeans\nimport org.apache.spark.ml.feature.VectorAssembler\nlabelCol: Integer = null\nnumberOfClusters: Integer = null\nmaxIterations: Integer = null\ndatasetPath: String = null\ndatasetSeparator: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\njava.lang.IllegalArgumentException: Can not create a Path from a null string\n at org.apache.hadoop.fs.Path.checkPathArg(Path.java:159)\n at org.apache.hadoop.fs.Path.<init>(Path.java:175)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:349)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:348)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.immutable.List.foreach(List.scala:381)\n at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n at scala.collection.immutable.List.flatMap(List.scala:344)\n at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:348)\n at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412)\n ... 
48 elided\n"}]},"apps":[],"jobName":"paragraph_1532679401526_-448526203","id":"20180314-120138_1441676951","dateCreated":"2018-07-27T10:16:41+0200","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:221"},{"title":"Gaussian Mixtures","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.clustering.GaussianMixture\nimport org.apache.spark.ml.feature.VectorAssembler\n\nvar labelCol: Integer = _\nvar numberOfClusters: Integer = _\nvar maxIterations: Integer = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n numberOfClusters = z.get(\"numberOfClusters\").asInstanceOf[String].toInt\n maxIterations = z.get(\"maxIterations\").asInstanceOf[String].toInt\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n \nval gmm = new GaussianMixture()\n .setFeaturesCol(\"features\")\n .setK(numberOfClusters)\n .setMaxIter(maxIterations) // apply the configured iteration cap (it was read above but never used)\n\n \ntry{\n val model = gmm.fit(assembler1)\n\n // Evaluate clustering.\n val trf = model.transform(assembler1)\n\n\n // Shows the result.\n var helpArray = Array.ofDim[Double](model.getK, df_features.columns.length)\n //val temp = df_features.columns.length\n // output parameters of mixture model model\n for (i <- 0 until model.getK) {\n println(s\"Gaussian $i:\\nweight=${model.weights(i)}\\n\" +\n s\"mu=${model.gaussians(i).mean}\\nsigma=\\n${model.gaussians(i).cov}\\n\")\n helpArray(i)= model.gaussians(i).mean.toArray \n }\n var datasetResult = model.gaussiansDF\n \n val now = Calendar.getInstance().getTime()\n val intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\n val dateString = intFormat.format(now)\n\n try {\n datasetResult.rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"gaussianMixtures_dataset_\" + dateString + \".csv\")\n model.save(datasetOutputPath + \"gaussianMixtures_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n // Prepare result for display\n z.put(\"datasetResult\", datasetResult)\n z.put(\"displayDataframe\", true)\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"\")\n z.angularBind(\"result2\", \"Cluster centers (mean):\")\n z.angularBind(\"arrayObjects\", helpArray)\n z.run(\"20180327-125756_1304518740\") \n\n}catch{\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n 
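// To compare runs with different k, the fitted model also exposes the training log-likelihood (Spark 2.2+), e.g.:\n // println(s\"log-likelihood = ${model.summary.logLikelihood}\")\n 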
}\n}","dateUpdated":"2018-08-27T12:12:40+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.clustering.GaussianMixture\nimport org.apache.spark.ml.feature.VectorAssembler\nlabelCol: Integer = null\nnumberOfClusters: Integer = null\nmaxIterations: Integer = null\ndatasetPath: String = null\ndatasetSeparator: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\njava.lang.IllegalArgumentException: Can not create a Path from a null string\n at org.apache.hadoop.fs.Path.checkPathArg(Path.java:159)\n at org.apache.hadoop.fs.Path.<init>(Path.java:175)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:349)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:348)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.immutable.List.foreach(List.scala:381)\n at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n at scala.collection.immutable.List.flatMap(List.scala:344)\n at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:348)\n at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412)\n ... 
48 elided\n"}]},"apps":[],"jobName":"paragraph_1532679401526_-448526203","id":"20180314-120159_2006536130","dateCreated":"2018-07-27T10:16:41+0200","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:222"},{"title":"Linear regression (OLS)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.{Pipeline, PipelineModel}\nimport org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}\nimport org.apache.spark.ml.feature.VectorAssembler\n\nvar labelCol: Integer = _\nvar maxIterations: Integer = _\nvar regularizationParam: Double = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n maxIterations = z.get(\"maxIterations\").asInstanceOf[String].toInt\n regularizationParam = z.get(\"regularizationParam\").asInstanceOf[String].toDouble\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n// Set up model\nval lr = new LinearRegression()\n .setFeaturesCol(\"features\")\n .setLabelCol(labelCol_str)\n .setMaxIter(maxIterations)\n .setRegParam(regularizationParam)\n\n// Set up evaluator\nval pipeline = new Pipeline().setStages(Array(lr))\nval evaluator = new RegressionEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"rmse\")\nval paramGrid = new ParamGridBuilder().build() // No parameter search\n\n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = _\nvar test: Double = _\nvar xCrossValidation: Int = _\n\nval now = Calendar.getInstance().getTime()\nval intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\nval dateString = intFormat.format(now)\n\nif (evalMethod == \"trainSplit\") {\n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model on the training split only, so the held-out testData stays unseen\n val lrModel = lr.fit(trainingData)\n // Print the coefficients and intercept for linear regression\n println(s\"Coefficients: ${lrModel.coefficients}, Intercept: ${lrModel.intercept}\")\n \n val trainingSummary = lrModel.summary\n // trainingSummary.residuals.show()\n // println(s\"RMSE: ${trainingSummary.rootMeanSquaredError}\")\n // println(s\"r2: ${trainingSummary.r2}\")\n \n // Select 
(prediction, true label) and compute rmse.\n val predictions = lrModel.transform(testData)\n val rmse = evaluator.evaluate(predictions)\n \n\n //for i in df_features.columns:\n //print( \"Correlation to total_damage for \", i, df_features.stat.corr(labelCol_str,i))\n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"ols_tt_predict_\" + dateString + \".csv\")\n lrModel.save(datasetOutputPath + \"ols_tt_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Coefficients: \" + lrModel.coefficients + \", Intercept: \" + lrModel.intercept)\n z.angularBind(\"result2\", \"Root-mean-square-error (RMSE) = \" + rmse)\n z.angularBind(\"arrayObjects\", Array())\n \n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,\n // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(dtc.regParam, Array(0.1, 0.01))\n // .build()\n // // Fit the model\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n \n val lrModel: LinearRegressionModel = cvModel\n .bestModel.asInstanceOf[PipelineModel]\n .stages\n .last.asInstanceOf[LinearRegressionModel]\n val avgResult = cvModel.avgMetrics(0)\n // Select (prediction, true label) and compute rmse.\n var Array(trainingData, testData) = assembler1.randomSplit(Array(0.8, 0.2), seed = 12345)\n val predictions = lrModel.transform(testData)\n val rmse = evaluator.evaluate(predictions)\n \n\n println(s\"Coefficients: ${lrModel.coefficients}, Intercept: ${lrModel.intercept}\")\n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"ols_cv_predict_\" + dateString + \".csv\")\n cvModel.save(datasetOutputPath + \"ols_cv_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \" Best model Coefficients: \" + lrModel.coefficients + \", Intercept: \" + lrModel.intercept)\n z.angularBind(\"result2\", \"Avg. 
Root-mean-square-error (RMSE) = \" + avgResult)\n z.angularBind(\"arrayObjects\", Array())\n //println(avgResult)\n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n}\n","user":"anonymous","dateUpdated":"2018-08-28T12:06:08+0200","config":{"lineNumbers":true,"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.{Pipeline, PipelineModel}\nimport org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}\nimport org.apache.spark.ml.feature.VectorAssembler\nlabelCol: Integer = null\nmaxIterations: Integer = null\nregularizationParam: Double = 0.0\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\nlabelColReal: Int = 1\nlabelCol_str: String = _c1\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double]\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 1 more field]\nlr: org.apache.spark.ml.regression.LinearRegression = linReg_31f8c8297858\npipeline: org.apache.spark.ml.Pipeline = pipeline_a920a664807e\nevaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_bfc9e3af5ab8\nparamGrid: Array[org.apache.spark.ml.param.ParamMap] =\nArray({\n\n})\nevalMethod: String = trainSplit\ntrain: Double = 0.0\ntest: Double = 0.0\nxCrossValidation: Int = 0\nnow: java.util.Date = Tue Aug 28 17:37:18 CEST 2018\nintFormat: java.text.SimpleDateFormat = java.text.SimpleDateFormat@b3d52dbf\ndateString: String = 20180828_053718\nCoefficients: [2539.2670351519914], Intercept: 1.8575234235674053E7\n"}]},"apps":[],"jobName":"paragraph_1532679401526_-448526203","id":"20180314-121848_2079236016","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-28T17:37:03+0200","dateFinished":"2018-08-28T17:37:29+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:223"},{"title":"Generalized Linear Regression (GLM)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.{Pipeline, PipelineModel}\nimport org.apache.spark.ml.regression.{GeneralizedLinearRegression, GeneralizedLinearRegressionModel}\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}\nimport org.apache.spark.ml.feature.VectorAssembler\n\nvar labelCol: Integer = _\nvar distFamily: String = _\nvar maxIterations: Integer = _\nvar regularizationParam: Double = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n distFamily = z.get(\"distFamily\").asInstanceOf[String]\n maxIterations = z.get(\"maxIterations\").asInstanceOf[String].toInt\n regularizationParam = z.get(\"regularizationParam\").asInstanceOf[String].toDouble\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = 
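\n// distFamily above selects the GLM error distribution: Spark's GeneralizedLinearRegression accepts \"gaussian\", \"binomial\", \"poisson\" and \"gamma\", each paired with a default link function (identity, logit, log and inverse respectively).\n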
z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n// Set up model\nval glr = new GeneralizedLinearRegression()\n .setFamily(distFamily)\n .setFeaturesCol(\"features\")\n .setLabelCol(labelCol_str)\n .setMaxIter(maxIterations)\n .setRegParam(regularizationParam)\n\n// Set up evaluator\nval pipeline = new Pipeline().setStages(Array(lr))\nval evaluator = new RegressionEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"rmse\")\nval paramGrid = new ParamGridBuilder().build() // No parameter search\n\n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = _\nvar test: Double = _\nvar xCrossValidation: Int = _\n\nval now = Calendar.getInstance().getTime()\nval intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\nval dateString = intFormat.format(now)\n\nif (evalMethod == \"trainSplit\") {\n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model\n val lrModel = lr.fit(assembler1)\n // Print the coefficients and intercept for linear regression\n println(s\"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}\")\n \n val trainingSummary = lrModel.summary\n // trainingSummary.residuals.show()\n // println(s\"RMSE: ${trainingSummary.rootMeanSquaredError}\")\n // println(s\"r2: ${trainingSummary.r2}\")\n \n // Select (prediction, true label) and compute rmse.\n val predictions = lrModel.transform(testData)\n val rmse = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"glm_tt_predict_\" + dateString + \".csv\")\n lrModel.save(datasetOutputPath + \"glm_tt_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Coefficients: \" + lrModel.coefficients + \" Intercept: \" + lrModel.intercept)\n z.angularBind(\"result2\", \"Root-mean-square-error (RMSE) = \" + rmse)\n z.angularBind(\"arrayObjects\", Array())\n \n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n 
z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,\n // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(dtc.regParam, Array(0.1, 0.01))\n // .build()\n // // Fit the model\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n \n val lrModel: LinearRegressionModel = cvModel\n .bestModel.asInstanceOf[PipelineModel]\n .stages\n .last.asInstanceOf[LinearRegressionModel]\n \n val avgResult = cvModel.avgMetrics(0)\n \n // Select (prediction, true label) and compute rmse.\n var Array(trainingData, testData) = assembler1.randomSplit(Array(0.8, 0.2), seed = 12345)\n val predictions = lrModel.transform(testData)\n val rmse = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"glm_cv_predict_\" + dateString + \".csv\")\n cvModel.save(datasetOutputPath + \"glm_cv_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Best model Coefficients: \" + lrModel.coefficients + \", Intercept: \" + lrModel.intercept)\n z.angularBind(\"result2\", \"Avg. Root-mean-square-error (RMSE) = \" + avgResult)\n z.angularBind(\"arrayObjects\", Array())\n //println(avgResult)\n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n}","user":"anonymous","dateUpdated":"2018-08-28T13:08:26+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","fontSize":9,"editorHide":true,"results":{},"enabled":true,"title":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.regression.GeneralizedLinearRegression\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}\nimport org.apache.spark.ml.feature.VectorAssembler\nlabelCol: Integer = null\ndistFamily: String = null\nmaxIterations: Integer = null\nregularizationParam: Double = 0.0\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\nlabelColReal: Int = 1\nlabelCol_str: String = _c1\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double]\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 
1 more field]\nglr: org.apache.spark.ml.regression.GeneralizedLinearRegression = glm_72a1c0977c5c\npipeline: org.apache.spark.ml.Pipeline = pipeline_055ddd1a8319\nevaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_79ab6cc996c5\nparamGrid: Array[org.apache.spark.ml.param.ParamMap] =\nArray({\n\n})\nevalMethod: String = trainSplit\ntrain: Double = 0.0\ntest: Double = 0.0\nxCrossValidation: Int = 0\nnow: java.util.Date = Tue Aug 28 12:04:39 CEST 2018\nintFormat: java.text.SimpleDateFormat = java.text.SimpleDateFormat@b3d52dbf\ndateString: String = 20180828_120439\nCoefficients: [2539.2670351519914] Intercept: 1.8575234235674053E7\n"}]},"apps":[],"jobName":"paragraph_1532679401528_-450834696","id":"20180327-125233_1985102579","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-28T12:04:34+0200","dateFinished":"2018-08-28T12:04:44+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:224"},{"title":"Decision trees Regression (DTR)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.regression.{DecisionTreeRegressor,DecisionTreeRegressionModel}\nimport org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}\nimport org.apache.spark.ml.feature.VectorAssembler\n\nvar labelCol: Integer = _\nvar maxDepth: Integer = _\nvar maxBins: Integer = _\nvar minInstancesPerNode: Integer = _\nvar minInfoGain: Double = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n maxDepth = z.get(\"maxDepth\").asInstanceOf[String].toInt\n maxBins = z.get(\"maxBins\").asInstanceOf[String].toInt\n minInstancesPerNode = z.get(\"minInstancesPerNode\").asInstanceOf[String].toInt\n minInfoGain = z.get(\"minInfoGain\").asInstanceOf[String].toDouble\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n\n// Create a DecisionTree model.\nval dtr = new DecisionTreeRegressor()\n .setLabelCol(labelCol_str)\n .setFeaturesCol(\"features\")\n .setMaxDepth(maxDepth)\n .setMaxBins(maxBins)\n .setMinInstancesPerNode(minInstancesPerNode)\n .setMinInfoGain(minInfoGain)\n \n// Setup evaluator\nval pipeline = new Pipeline().setStages(Array(dtr)) \nval evaluator = new RegressionEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"rmse\")\nval paramGrid = new ParamGridBuilder().build() // No parameter search\n\nval now = Calendar.getInstance().getTime()\nval intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\nval dateString = 
intFormat.format(now)\n\n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = 100.0\nvar test: Double = 0.0\nvar xCrossValidation: Int = 0\nif (evalMethod == \"trainSplit\") {\n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model\n val dtrModel = dtr.fit(assembler1)\n\n // Select (prediction, true label) and compute rmse.\n val predictions = dtrModel.transform(testData)\n val rmse = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"decisionTreeRegression_tt_predict_\" + dateString + \".csv\")\n dtrModel.save(datasetOutputPath + \"decisionTreeRegression_tt_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"\")\n z.angularBind(\"result2\", \"Root-mean-square-error (RMSE) = \" + rmse)\n \n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,\n // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(dtc.regParam, Array(0.1, 0.01))\n // .build()\n // // Fit the model\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n \n val avgResult = cvModel.avgMetrics(0)\n val dtrModel: DecisionTreeRegressionModel = cvModel\n .bestModel.asInstanceOf[PipelineModel]\n .stages\n .last.asInstanceOf[DecisionTreeRegressionModel]\n \n var Array(trainingData, testData) = assembler1.randomSplit(Array(0.8, 0.2), seed = 12345)\n val predictions = dtrModel.transform(testData)\n val rmse = evaluator.evaluate(predictions) \n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"decisionTreeRegression_cv_predict_\" + dateString + \".csv\")\n cvModel.save(datasetOutputPath + \"decisionTreeRegression_cv_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"\")\n z.angularBind(\"result2\", \"Avg. 
Root-mean-square-error (RMSE) = \" + avgResult)\n //z.angularBind(\"arrayObjects\", cvModel.avgMetrics)\n //println(avgResult)\n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n } \n}\n","user":"anonymous","dateUpdated":"2018-08-28T13:08:33+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.regression.{DecisionTreeRegressor, DecisionTreeRegressionModel}\nimport org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}\nimport org.apache.spark.ml.feature.VectorAssembler\nlabelCol: Integer = null\nmaxDepth: Integer = null\nmaxBins: Integer = null\nminInstancesPerNode: Integer = null\nminInfoGain: Double = 0.0\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\nlabelColReal: Int = 1\nlabelCol_str: String = _c1\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double]\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 1 more field]\ndtr: org.apache.spark.ml.regression.DecisionTreeRegressor = dtr_0ffcfd341ae8\npipeline: org.apache.spark.ml.Pipeline = pipeline_2f6362b67da4\nevaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_b2657170260f\nparamGrid: Array[org.apache.spark.ml.param.ParamMap] =\nArray({\n\n})\nnow: java.util.Date = Tue Aug 28 12:18:01 CEST 2018\nintFormat: java.text.SimpleDateFormat = java.text.SimpleDateFormat@b3d52dbf\ndateString: String = 20180828_121801\nevalMethod: String = crossValidation\ntrain: Double = 100.0\ntest: Double = 0.0\nxCrossValidation: Int = 0\n"}]},"apps":[],"jobName":"paragraph_1532679401527_-448910952","id":"20180314-121851_102488565","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-28T12:17:57+0200","dateFinished":"2018-08-28T12:18:14+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:225"},{"title":"Decision trees Classifier (DTC)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{DecisionTreeClassificationModel,DecisionTreeClassifier}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler}\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\n\nvar labelCol: Integer = _\nvar maxDepth: Integer = _\nvar maxBins: Integer = _\nvar minInstancesPerNode: Integer = _\nvar minInfoGain: Double = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n maxDepth = z.get(\"maxDepth\").asInstanceOf[String].toInt\n maxBins = z.get(\"maxBins\").asInstanceOf[String].toInt\n minInstancesPerNode = z.get(\"minInstancesPerNode\").asInstanceOf[String].toInt\n 
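// (the settings form stores every parameter as a String, hence the explicit parsing here)\n 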
minInfoGain = z.get(\"minInfoGain\").asInstanceOf[String].toDouble\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n\n// Create a DecisionTree model.\nval dtc = new DecisionTreeClassifier()\n .setLabelCol(labelCol_str)\n .setFeaturesCol(\"features\")\n .setMaxDepth(maxDepth)\n .setMaxBins(maxBins)\n .setMinInstancesPerNode(minInstancesPerNode)\n .setMinInfoGain(minInfoGain)\n \n// Setup evaluator\nval pipeline = new Pipeline().setStages(Array(dtc)) \nval evaluator = new MulticlassClassificationEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"f1\")\nval paramGrid = new ParamGridBuilder().build() // No parameter search\n\nval now = Calendar.getInstance().getTime()\nval intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\nval dateString = intFormat.format(now)\n \n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = _\nvar test: Double = _\nvar xCrossValidation: Int = _\n\nif (evalMethod == \"trainSplit\") {\n \n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model\n val dtcModel = dtc.fit(assembler1)\n //Evaluation\n val predictions = dtcModel.transform(testData)\n // Select (prediction, true label) and compute test error.\n \n val accuracy = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"decisionTreeClassifier_tt_predict_\" + dateString + \".csv\")\n dtcModel.save(datasetOutputPath + \"decisionTreeClassifier_tt_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n\n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Accuracy = \" + (100*accuracy)+\"%\")\n z.angularBind(\"result2\", \"Test Error = \" + (1.0 - accuracy))\n \n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = 
z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,\n // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(dtc.regParam, Array(0.1, 0.01))\n // .build()\n // // Fit the model\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n val dtcModel: DecisionTreeClassificationModel = cvModel\n .bestModel.asInstanceOf[PipelineModel]\n .stages\n .last.asInstanceOf[DecisionTreeClassificationModel]\n val avgResult = cvModel.avgMetrics(0)\n // Select (prediction, true label) and compute rmse.\n var Array(trainingData, testData) = assembler1.randomSplit(Array(0.8, 0.2), seed = 12345)\n val predictions = dtcModel.transform(testData)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"decisionTreeClassifier_cv_predict_\" + dateString + \".csv\")\n cvModel.save(datasetOutputPath + \"decisionTreeClassifier_cv_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Avg. Accuracy = \" + (100*avgResult)+\"%\")\n z.angularBind(\"result2\", \"Avg. Test Error = \" + (1.0 - avgResult))\n //println(avgResult)\n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n \n}","user":"anonymous","dateUpdated":"2018-08-28T14:54:52+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler}\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\nlabelCol: Integer = null\nmaxDepth: Integer = null\nmaxBins: Integer = null\nminInstancesPerNode: Integer = null\nminInfoGain: Double = 0.0\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 
4 more fields]\ndtc: org.apache.spark.ml.classification.DecisionTreeClassifier = dtc_5b0a205107eb\npipeline: org.apache.spark.ml.Pipeline = pipeline_9db0efae4ec3\nevaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_bad24f9c8d58\nparamGrid: Array[org.apache.spark.ml.param.ParamMap] =\nArray({\n\n})\nnow: java.util.Date = Tue Aug 28 13:28:02 CEST 2018\nintFormat: java.text.SimpleDateFormat = java.text.SimpleDateFormat@b3d52dbf\ndateString: String = 20180828_012802\nevalMethod: String = crossValidation\ntrain: Double = 0.0\ntest: Double = 0.0\nxCrossValidation: Int = 0\n"}]},"apps":[],"jobName":"paragraph_1532679401527_-448910952","id":"20180314-121852_1052604787","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-28T13:27:59+0200","dateFinished":"2018-08-28T13:28:17+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:226"},{"title":"Random Forest Regressor (RFR)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.regression.{RandomForestRegressor,RandomForestRegressionModel}\nimport org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}\nimport org.apache.spark.ml.feature.VectorAssembler\n\nvar labelCol: Integer = _\nvar maxDepth: Integer = _\nvar maxBins: Integer = _\nvar minInstancesPerNode: Integer = _\nvar minInfoGain: Double = _\nvar numTrees: Integer = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n maxDepth = z.get(\"maxDepth\").asInstanceOf[String].toInt\n maxBins = z.get(\"maxBins\").asInstanceOf[String].toInt\n minInstancesPerNode = z.get(\"minInstancesPerNode\").asInstanceOf[String].toInt\n minInfoGain = z.get(\"minInfoGain\").asInstanceOf[String].toDouble\n numTrees = z.get(\"numTrees\").asInstanceOf[String].toInt\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n\n// Create a DecisionTree model.\nval dtr = new RandomForestRegressor()\n .setLabelCol(labelCol_str)\n .setFeaturesCol(\"features\")\n .setMaxDepth(maxDepth)\n .setMaxBins(maxBins)\n .setMinInstancesPerNode(minInstancesPerNode)\n .setMinInfoGain(minInfoGain)\n .setNumTrees(numTrees)\n \n// Setup evaluator\nval pipeline = new Pipeline().setStages(Array(dtr)) \nval evaluator = new RegressionEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"rmse\")\nval paramGrid = new ParamGridBuilder().build() // No parameter search\n\nval now = Calendar.getInstance().getTime()\nval intFormat = new 
SimpleDateFormat(\"yyyyMMdd_hhmmss\")\nval dateString = intFormat.format(now)\n\n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = 100.0\nvar test: Double = 0.0\nvar xCrossValidation: Int = 0\nif (evalMethod == \"trainSplit\") {\n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model\n val dtrModel = dtr.fit(assembler1)\n\n // Select (prediction, true label) and compute rmse.\n val predictions = dtrModel.transform(testData)\n val rmse = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"randomForestRegression_tt_predict_\" + dateString + \".csv\")\n dtrModel.save(datasetOutputPath + \"randomForestRegression_tt_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"\")\n z.angularBind(\"result2\", \"Root-mean-square-error (RMSE) = \" + rmse)\n \n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,\n // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(dtc.regParam, Array(0.1, 0.01))\n // .build()\n // // Fit the model\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n \n val avgResult = cvModel.avgMetrics(0)\n val dtrModel: RandomForestRegressionModel = cvModel\n .bestModel.asInstanceOf[PipelineModel]\n .stages\n .last.asInstanceOf[RandomForestRegressionModel]\n \n var Array(trainingData, testData) = assembler1.randomSplit(Array(0.8, 0.2), seed = 12345)\n val predictions = dtrModel.transform(testData)\n val rmse = evaluator.evaluate(predictions) \n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"randomForestRegression_cv_predict_\" + dateString + \".csv\")\n cvModel.save(datasetOutputPath + \"randomForestRegression_cv_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"\")\n z.angularBind(\"result2\", \"Avg. 
Root-mean-square-error (RMSE) = \" + avgResult)\n //z.angularBind(\"arrayObjects\", cvModel.avgMetrics)\n //println(avgResult)\n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n } \n}\n","user":"anonymous","dateUpdated":"2018-08-28T14:54:47+0200","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"scala"},"editorMode":"ace/mode/scala","title":true,"editorHide":true,"tableHide":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.regression.{RandomForestRegressor, RandomForestRegressionModel}\nimport org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}\nimport org.apache.spark.ml.feature.VectorAssembler\nlabelCol: Integer = null\nmaxDepth: Integer = null\nmaxBins: Integer = null\nminInstancesPerNode: Integer = null\nminInfoGain: Double = 0.0\nnumTrees: Integer = null\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\nlabelColReal: Int = 1\nlabelCol_str: String = _c1\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double]\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 1 more field]\ndtr: org.apache.spark.ml.regression.RandomForestRegressor = rfr_5b4d453e6758\npipeline: org.apache.spark.ml.Pipeline = pipeline_335fcce31f21\nevaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_1c3638a9202e\nparamGrid: Array[org.apache.spark.ml.param.ParamMap] =\nArray({\n\n})\nnow: java.util.Date = Tue Aug 28 13:00:51 CEST 2018\nintFormat: java.text.SimpleDateFormat = java.text.SimpleDateFormat@b3d52dbf\ndateString: String = 20180828_010051\nevalMethod: String = trainSplit\ntrain: Double = 100.0\ntest: Double = 0.0\nxCrossValidation: Int = 0\n"}]},"apps":[],"jobName":"paragraph_1535451652526_257355296","id":"20180828-122052_380805630","dateCreated":"2018-08-28T12:20:52+0200","dateStarted":"2018-08-28T13:00:47+0200","dateFinished":"2018-08-28T13:00:58+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:227"},{"title":"Random Forest Classifier (RFC)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler}\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\n\nvar labelCol: Integer = _\nvar maxDepth: Integer = _\nvar maxBins: Integer = _\nvar minInstancesPerNode: Integer = _\nvar minInfoGain: Double = _\nvar numTrees: Integer = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n maxDepth = z.get(\"maxDepth\").asInstanceOf[String].toInt\n maxBins = z.get(\"maxBins\").asInstanceOf[String].toInt\n minInstancesPerNode = 
z.get(\"minInstancesPerNode\").asInstanceOf[String].toInt\n minInfoGain = z.get(\"minInfoGain\").asInstanceOf[String].toDouble\n numTrees = z.get(\"numTrees\").asInstanceOf[String].toInt\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n\n// Create a DecisionTree model.\nval dtc = new RandomForestClassifier()\n .setLabelCol(labelCol_str)\n .setFeaturesCol(\"features\")\n .setMaxDepth(maxDepth)\n .setMaxBins(maxBins)\n .setMinInstancesPerNode(minInstancesPerNode)\n .setMinInfoGain(minInfoGain)\n .setNumTrees(numTrees)\n \n// Setup evaluator\nval pipeline = new Pipeline().setStages(Array(dtc)) \nval evaluator = new MulticlassClassificationEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"f1\")\nval paramGrid = new ParamGridBuilder().build() // No parameter search\n\nval now = Calendar.getInstance().getTime()\nval intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\nval dateString = intFormat.format(now)\n \n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = _\nvar test: Double = _\nvar xCrossValidation: Int = _\n\nif (evalMethod == \"trainSplit\") {\n \n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model\n val dtcModel = dtc.fit(assembler1)\n //Evaluation\n val predictions = dtcModel.transform(testData)\n // Select (prediction, true label) and compute test error.\n \n val accuracy = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"randomForestClassifier_tt_predict_\" + dateString + \".csv\")\n dtcModel.save(datasetOutputPath + \"randomForestClassifier_tt_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n\n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Accuracy = \" + (100*accuracy)+\"%\")\n z.angularBind(\"result2\", \"Test Error = \" + (1.0 - accuracy))\n \n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n 
z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,\n // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(dtc.regParam, Array(0.1, 0.01))\n // .build()\n // // Fit the model\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n val avgResult = cvModel.avgMetrics(0)\n \n val dtcModel: RandomForestClassificationModel = cvModel\n .bestModel.asInstanceOf[PipelineModel]\n .stages\n .last.asInstanceOf[RandomForestClassificationModel]\n // Select (prediction, true label) and compute accuracy.\n var Array(trainingData, testData) = assembler1.randomSplit(Array(0.8, 0.2), seed = 12345)\n val predictions = dtcModel.transform(testData)\n val rmse = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"randomForestClassifier_cv_predict_\" + dateString + \".csv\")\n cvModel.save(datasetOutputPath + \"randomForestClassifier_cv_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Avg. Accuracy = \" + (100*avgResult)+\"%\")\n z.angularBind(\"result2\", \"Avg. Test Error = \" + (1.0 - avgResult))\n //println(avgResult)\n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n \n}","user":"anonymous","dateUpdated":"2018-08-28T14:54:10+0200","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"scala"},"editorMode":"ace/mode/scala","title":true,"editorHide":true,"tableHide":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler}\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\nlabelCol: Integer = null\nmaxDepth: Integer = null\nmaxBins: Integer = null\nminInstancesPerNode: Integer = null\nminInfoGain: Double = 0.0\nnumTrees: Integer = null\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 
3 more fields]\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 4 more fields]\ndtc: org.apache.spark.ml.classification.RandomForestClassifier = rfc_7689b5fb20a8\npipeline: org.apache.spark.ml.Pipeline = pipeline_cc0fe6ca4fb0\nevaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_85ab84812ce1\nparamGrid: Array[org.apache.spark.ml.param.ParamMap] =\nArray({\n\n})\nnow: java.util.Date = Tue Aug 28 14:42:26 CEST 2018\nintFormat: java.text.SimpleDateFormat = java.text.SimpleDateFormat@b3d52dbf\ndateString: String = 20180828_024226\nevalMethod: String = crossValidation\ntrain: Double = 0.0\ntest: Double = 0.0\nxCrossValidation: Int = 0\n"}]},"apps":[],"jobName":"paragraph_1535454769960_1598830215","id":"20180828-131249_1439234059","dateCreated":"2018-08-28T13:12:49+0200","dateStarted":"2018-08-28T14:42:22+0200","dateFinished":"2018-08-28T14:42:42+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:228"},{"title":"Gradient-boosted tree Regression (GBTR)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}\nimport org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}\nimport org.apache.spark.ml.feature.VectorAssembler\n\nvar labelCol: Integer = _\nvar maxDepth: Integer = _\nvar maxBins: Integer = _\nvar minInstancesPerNode: Integer = _\nvar minInfoGain: Double = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n maxDepth = z.get(\"maxDepth\").asInstanceOf[String].toInt\n maxBins = z.get(\"maxBins\").asInstanceOf[String].toInt\n minInstancesPerNode = z.get(\"minInstancesPerNode\").asInstanceOf[String].toInt\n minInfoGain = z.get(\"minInfoGain\").asInstanceOf[String].toDouble\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n\n// Create a DecisionTree model.\nval dtr = new GBTRegressor()\n .setLabelCol(labelCol_str)\n .setFeaturesCol(\"features\")\n .setMaxDepth(maxDepth)\n .setMaxBins(maxBins)\n .setMinInstancesPerNode(minInstancesPerNode)\n .setMinInfoGain(minInfoGain)\n\n \n// Setup evaluator\nval pipeline = new Pipeline().setStages(Array(dtr)) \nval evaluator = new RegressionEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"rmse\")\nval paramGrid = new ParamGridBuilder().build() // No parameter search\n\nval now = Calendar.getInstance().getTime()\nval intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\nval dateString 
= intFormat.format(now)\n\n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = 100.0\nvar test: Double = 0.0\nvar xCrossValidation: Int = 0\nif (evalMethod == \"trainSplit\") {\n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model\n val dtrModel = dtr.fit(assembler1)\n\n // Select (prediction, true label) and compute rmse.\n val predictions = dtrModel.transform(testData)\n val rmse = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"gbtRegression_tt_predict_\" + dateString + \".csv\")\n dtrModel.save(datasetOutputPath + \"gbtRegression_tt_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"\")\n z.angularBind(\"result2\", \"Root-mean-square-error (RMSE) = \" + rmse)\n \n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,\n // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(dtc.regParam, Array(0.1, 0.01))\n // .build()\n // // Fit the model\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n \n val avgResult = cvModel.avgMetrics(0)\n val dtrModel: GBTRegressionModel = cvModel\n .bestModel.asInstanceOf[PipelineModel]\n .stages\n .last.asInstanceOf[GBTRegressionModel]\n \n var Array(trainingData, testData) = assembler1.randomSplit(Array(0.8, 0.2), seed = 12345)\n val predictions = dtrModel.transform(testData)\n val rmse = evaluator.evaluate(predictions) \n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"gbtRegression_cv_predict_\" + dateString + \".csv\")\n cvModel.save(datasetOutputPath + \"gbtRegression_cv_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"\")\n z.angularBind(\"result2\", \"Avg. 
Root-mean-square-error (RMSE) = \" + avgResult)\n //z.angularBind(\"arrayObjects\", cvModel.avgMetrics)\n //println(avgResult)\n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n } \n}\n","user":"anonymous","dateUpdated":"2018-08-28T13:08:54+0200","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"scala"},"editorMode":"ace/mode/scala","title":true,"editorHide":true,"tableHide":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.evaluation.RegressionEvaluator\nimport org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}\nimport org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}\nimport org.apache.spark.ml.feature.VectorAssembler\nlabelCol: Integer = null\nmaxDepth: Integer = null\nmaxBins: Integer = null\nminInstancesPerNode: Integer = null\nminInfoGain: Double = 0.0\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\nlabelColReal: Int = 1\nlabelCol_str: String = _c1\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double]\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 1 more field]\ndtr: org.apache.spark.ml.regression.GBTRegressor = gbtr_150a17358199\npipeline: org.apache.spark.ml.Pipeline = pipeline_b582dac78ec5\nevaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = regEval_307b6e900873\nparamGrid: Array[org.apache.spark.ml.param.ParamMap] =\nArray({\n\n})\nnow: java.util.Date = Tue Aug 28 13:01:22 CEST 2018\nintFormat: java.text.SimpleDateFormat = java.text.SimpleDateFormat@b3d52dbf\ndateString: String = 20180828_010122\nevalMethod: String = trainSplit\ntrain: Double = 100.0\ntest: Double = 0.0\nxCrossValidation: Int = 0\n"}]},"apps":[],"jobName":"paragraph_1535453613224_953002089","id":"20180828-125333_2013070982","dateCreated":"2018-08-28T12:53:33+0200","dateStarted":"2018-08-28T13:01:18+0200","dateFinished":"2018-08-28T13:01:42+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:229"},{"title":"Gradient-boosting tree classifier (GBTC)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler}\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\n\nvar labelCol: Integer = _\nvar maxDepth: Integer = _\nvar maxBins: Integer = _\nvar minInstancesPerNode: Integer = _\nvar minInfoGain: Double = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n maxDepth = z.get(\"maxDepth\").asInstanceOf[String].toInt\n maxBins = z.get(\"maxBins\").asInstanceOf[String].toInt\n minInstancesPerNode = z.get(\"minInstancesPerNode\").asInstanceOf[String].toInt\n minInfoGain = 
z.get(\"minInfoGain\").asInstanceOf[String].toDouble\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n\n// Create a DecisionTree model.\nval dtc = new GBTClassifier()\n .setLabelCol(labelCol_str)\n .setFeaturesCol(\"features\")\n .setMaxDepth(maxDepth)\n .setMaxBins(maxBins)\n .setMinInstancesPerNode(minInstancesPerNode)\n .setMinInfoGain(minInfoGain)\n \n// Setup evaluator\nval pipeline = new Pipeline().setStages(Array(dtc)) \nval evaluator = new MulticlassClassificationEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"f1\")\nval paramGrid = new ParamGridBuilder().build() // No parameter search\n\nval now = Calendar.getInstance().getTime()\nval intFormat = new SimpleDateFormat(\"yyyyMMdd_hhmmss\")\nval dateString = intFormat.format(now)\n \n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = _\nvar test: Double = _\nvar xCrossValidation: Int = _\n\nif (evalMethod == \"trainSplit\") {\n \n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model\n val dtcModel = dtc.fit(assembler1)\n //Evaluation\n val predictions = dtcModel.transform(testData)\n // Select (prediction, true label) and compute test error.\n \n val accuracy = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"gbtClassifier_tt_predict_\" + dateString + \".csv\")\n dtcModel.save(datasetOutputPath + \"decisionTreeClassifier_tt_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n\n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Accuracy = \" + (100*accuracy)+\"%\")\n z.angularBind(\"result2\", \"Test Error = \" + (1.0 - accuracy))\n \n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a 
ParamGridBuilder to construct a grid of parameters to search over.\n // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,\n // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(dtc.regParam, Array(0.1, 0.01))\n // .build()\n // // Fit the model\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n val avgResult = cvModel.avgMetrics(0)\n \n val dtcModel: GBTClassificationModel = cvModel\n .bestModel.asInstanceOf[PipelineModel]\n .stages\n .last.asInstanceOf[GBTClassificationModel]\n // Select (prediction, true label) and compute accuracy.\n var Array(trainingData, testData) = assembler1.randomSplit(Array(0.8, 0.2), seed = 12345)\n val predictions = dtcModel.transform(testData)\n val rmse = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"gbtClassifier_cv_predict_\" + dateString + \".csv\")\n cvModel.save(datasetOutputPath + \"gbtClassifier_cv_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Avg. Accuracy = \" + (100*avgResult)+\"%\")\n z.angularBind(\"result2\", \"Avg. Test Error = \" + (1.0 - avgResult))\n //println(avgResult)\n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n \n}","user":"anonymous","dateUpdated":"2018-08-28T14:54:02+0200","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"scala"},"editorMode":"ace/mode/scala","title":true,"editorHide":true,"tableHide":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler}\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\nlabelCol: Integer = null\nmaxDepth: Integer = null\nmaxBins: Integer = null\nminInstancesPerNode: Integer = null\nminInfoGain: Double = 0.0\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 
4 more fields]\ndtc: org.apache.spark.ml.classification.GBTClassifier = gbtc_3e1de44bdd9c\npipeline: org.apache.spark.ml.Pipeline = pipeline_d748a3c86bce\nevaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_3d71d4544e09\nparamGrid: Array[org.apache.spark.ml.param.ParamMap] =\nArray({\n\n})\nnow: java.util.Date = Tue Aug 28 14:43:12 CEST 2018\nintFormat: java.text.SimpleDateFormat = java.text.SimpleDateFormat@b3d52dbf\ndateString: String = 20180828_024312\nevalMethod: String = crossValidation\ntrain: Double = 0.0\ntest: Double = 0.0\nxCrossValidation: Int = 0\n"}]},"apps":[],"jobName":"paragraph_1535454773667_1415705063","id":"20180828-131253_989652806","dateCreated":"2018-08-28T13:12:53+0200","dateStarted":"2018-08-28T14:43:08+0200","dateFinished":"2018-08-28T14:43:13+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:230"},{"title":"SVM","text":"// Linear Support Vector Machines (binary classification only)\n// NOTE: this paragraph is still a work in progress: the mllib SVM imports below are unused\n// and the pipeline currently trains a DecisionTreeClassifier as a stand-in (see the sketch\n// after the evaluator setup).\nimport org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}\nimport org.apache.spark.mllib.evaluation.BinaryClassificationMetrics\nimport org.apache.spark.mllib.util.MLUtils\n\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler}\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\n\nvar labelCol: Integer = _\nvar maxDepth: Integer = _\nvar maxBins: Integer = _\nvar minInstancesPerNode: Integer = _\nvar minInfoGain: Double = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n maxDepth = z.get(\"maxDepth\").asInstanceOf[String].toInt\n maxBins = z.get(\"maxBins\").asInstanceOf[String].toInt\n minInstancesPerNode = z.get(\"minInstancesPerNode\").asInstanceOf[String].toInt\n minInfoGain = z.get(\"minInfoGain\").asInstanceOf[String].toDouble\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n\n// Create a DecisionTree model (stand-in for the SVM).\nval dtc = new DecisionTreeClassifier()\n .setLabelCol(labelCol_str)\n .setFeaturesCol(\"features\")\n .setMaxDepth(maxDepth)\n .setMaxBins(maxBins)\n .setMinInstancesPerNode(minInstancesPerNode)\n .setMinInfoGain(minInfoGain)\n \n// Setup evaluator\nval pipeline = new Pipeline().setStages(Array(dtc)) \nval evaluator = new MulticlassClassificationEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"f1\")\n
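// Hypothetical sketch (an assumption, not part of the original notebook): Spark 2.2's ml\n// package ships org.apache.spark.ml.classification.LinearSVC, which could replace the\n// DecisionTreeClassifier stand-in above and make this paragraph an actual linear SVM\n// (binary labels only, like SVMWithSGD):\n// val svc = new LinearSVC()\n// .setLabelCol(labelCol_str)\n// .setFeaturesCol(\"features\")\n// .setMaxIter(10)\n// .setRegParam(0.1)\n// val pipeline = new Pipeline().setStages(Array(svc))\nval 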
paramGrid = new ParamGridBuilder().build() // No parameter search\n \n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = _\nvar test: Double = _\nvar xCrossValidation: Int = _\n\nif (evalMethod == \"trainSplit\") {\n \n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model on the training split only, so the test split stays unseen\n val dtcModel = dtc.fit(trainingData)\n //Evaluation\n val predictions = dtcModel.transform(testData)\n // Select (prediction, true label) and compute test error.\n \n val accuracy = evaluator.evaluate(predictions)\n \n \n // dtcModel\n\n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Accuracy = \" + (100*accuracy)+\"%\")\n z.angularBind(\"result2\", \"Test Error = \" + (1.0 - accuracy))\n \n z.put(\"displayDataframe\", false)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With e.g. 3 values for one parameter and 2 values for another,\n // the grid would have 3 x 2 = 6 parameter settings for CrossValidator to choose from:\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(dtc.maxDepth, Array(3, 5, 7))\n // .addGrid(dtc.maxBins, Array(16, 32))\n // .build()\n // Fit the model via cross-validation\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n val avgResult = cvModel.avgMetrics(0)\n \n // cvModel\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Avg. Accuracy = \" + (100*avgResult)+\"%\")\n z.angularBind(\"result2\", \"Avg. 
Test Error = \" + (1.0 - avgResult))\n //println(avgResult)\n z.put(\"displayDataframe\", false)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n \n}\n\n\n\nval labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\nval maxIterations = z.get(\"maxIterations\").asInstanceOf[String].toInt\nval regularizationParam = z.get(\"regularizationParam\").asInstanceOf[String].toDouble\nval convergenceTolerance = z.get(\"convergenceTolerance\").asInstanceOf[String].toDouble\n\nprintln(labelCol)\nprintln(regularizationParam)\nprintln(maxIterations)\nprintln(convergenceTolerance)\n\n\n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = 100.0\nvar test: Double = 0.0\nvar xCrossValidation: Int = 0\nif (evalMethod == \"trainSplit\") {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model\n val dtrModel = dtr.fit(assembler1)\n // Print the coefficients and intercept for linear regression\n println(s\"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}\")\n \n // Summarize the model over the training set and print out some metrics\n val trainingSummary = dtrModel.summary\n println(s\"numIterations: ${trainingSummary.totalIterations}\")\n println(s\"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(\",\")}]\")\n trainingSummary.residuals.show()\n println(s\"RMSE: ${trainingSummary.rootMeanSquaredError}\")\n println(s\"r2: ${trainingSummary.r2}\")\n \n //or\n val predictions = model.transform(testData)\n // Select (prediction, true label) and compute test error.\n val evaluator = new RegressionEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"rmse\")\n val rmse = evaluator.evaluate(predictions)\n println(\"Root Mean Squared Error (RMSE) on test data = \" + rmse)\n \n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,\n // this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.\n val paramGrid = new ParamGridBuilder()\n .addGrid(dtr.regParam, Array(0.1, 0.01))\n .build()\n // Fit the model\n val cv = new CrossValidator()\n .setEstimator(dtr)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) // Use 3+ in practice\n \n val cvModel = cv.fit(assembler1) \n}","dateUpdated":"2018-08-28T12:59:02+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":true,"fontSize":9,"title":false,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"ERROR","msg":[{"type":"TEXT","data":"import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}\nimport org.apache.spark.mllib.evaluation.BinaryClassificationMetrics\nimport org.apache.spark.mllib.util.MLUtils\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{DecisionTreeClassificationModel, 
DecisionTreeClassifier}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler}\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\nlabelCol: Integer = null\nmaxDepth: Integer = null\nmaxBins: Integer = null\nminInstancesPerNode: Integer = null\nminInfoGain: Double = 0.0\ndatasetPath: String = null\ndatasetSeparator: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\njava.lang.IllegalArgumentException: Can not create a Path from a null string\n at org.apache.hadoop.fs.Path.checkPathArg(Path.java:159)\n at org.apache.hadoop.fs.Path.<init>(Path.java:175)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:349)\n at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$14.apply(DataSource.scala:348)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n at scala.collection.immutable.List.foreach(List.scala:381)\n at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n at scala.collection.immutable.List.flatMap(List.scala:344)\n at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:348)\n at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533)\n at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412)\n ... 48 elided\n"}]},"apps":[],"jobName":"paragraph_1532679401527_-448910952","id":"20180314-121853_1193799050","dateCreated":"2018-07-27T10:16:41+0200","status":"READY","errorMessage":"","progressUpdateIntervalMs":500,"$$hashKey":"object:231"},{"title":"Multi-layer Perceptron (MLP)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\n\nvar labelCol: Integer = _\nvar maxIterations: Integer = _\nvar step: Double = _\nvar convergenceTolerance: Double = _\nvar seed: Long = _\nvar layers: IndexedSeq[String] = _\nvar intLayers: Array[Int] = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n maxIterations = z.get(\"maxIterations\").asInstanceOf[String].toInt\n step = z.get(\"step\").asInstanceOf[String].toDouble\n convergenceTolerance = z.get(\"convergenceTolerance\").asInstanceOf[String].toDouble\n seed = z.get(\"seed\").asInstanceOf[String].toLong\n // specify layers for the neural network:\n // input layer of size 4 (features), two intermediate of size 5 and 4\n // and output of size 3 (classes)\n layers = z.get(\"layers\").asInstanceOf[String].split(\",\")\n intLayers = layers.map(_.toInt).toArray\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", 
e.getMessage)\n }\n}\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n\n\n// Create an MLP model (the trainer) and set its parameters\nval mlp = new MultilayerPerceptronClassifier()\n .setFeaturesCol(\"features\")\n .setLabelCol(labelCol_str)\n .setLayers(intLayers)\n .setSeed(seed)\n .setMaxIter(maxIterations)\n .setStepSize(step)\n .setTol(convergenceTolerance)\n\n// Setup evaluator\nval pipeline = new Pipeline().setStages(Array(mlp)) \nval evaluator = new MulticlassClassificationEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"f1\")\nval paramGrid = new ParamGridBuilder().build() // No parameter search\n\nval now = Calendar.getInstance().getTime()\nval intFormat = new SimpleDateFormat(\"yyyyMMdd_HHmmss\") // HH = 24-hour clock, so output names sort chronologically\nval dateString = intFormat.format(now)\n \n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\nvar train: Double = _\nvar test: Double = _\nvar xCrossValidation: Int = _\n\nif (evalMethod == \"trainSplit\") {\n \n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model on the training split only, so the test split stays unseen\n val mlpModel = mlp.fit(trainingData)\n //Evaluation\n val predictions = mlpModel.transform(testData)\n // Select (prediction, true label) and compute test error.\n \n val accuracy = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"mlp_tt_predict_\" + dateString + \".csv\")\n mlpModel.save(datasetOutputPath + \"mlp_tt_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n\n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Accuracy = \" + (100*accuracy)+\"%\")\n z.angularBind(\"result2\", \"Test Error = \" + (1.0 - accuracy))\n \n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With e.g. 3 values for one parameter and 2 values for another, the grid\n // would have 3 x 2 = 6 parameter settings for CrossValidator to choose from.\n 
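// Hypothetical illustration (an assumption, not part of the original notebook), using\n // parameters this MLP estimator actually exposes; the empty grid defined above is kept\n // so the paragraph's behaviour is unchanged:\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(mlp.maxIter, Array(50, 100, 200))\n // .addGrid(mlp.stepSize, Array(0.03, 0.1))\n // .build()\n 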
// Fit the model via cross-validation\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n // Extract the best fitted model from the cross-validated pipeline\n val mlpModel: MultilayerPerceptronClassificationModel = cvModel\n .bestModel.asInstanceOf[PipelineModel]\n .stages\n .last.asInstanceOf[MultilayerPerceptronClassificationModel]\n val avgResult = cvModel.avgMetrics(0)\n // Select (prediction, true label) and compute accuracy on a held-out split.\n var Array(trainingData, testData) = assembler1.randomSplit(Array(0.8, 0.2), seed = 12345)\n val predictions = mlpModel.transform(testData)\n val accuracy = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"mlp_cv_predict_\" + dateString + \".csv\")\n cvModel.save(datasetOutputPath + \"mlp_cv_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Avg. Accuracy = \" + (100*avgResult)+\"%\")\n z.angularBind(\"result2\", \"Avg. Test Error = \" + (1.0 - avgResult))\n //println(avgResult)\n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n \n}","user":"anonymous","dateUpdated":"2018-08-28T15:34:07+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":false,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{MultilayerPerceptronClassifier, MultilayerPerceptronClassificationModel}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\nlabelCol: Integer = null\nmaxIterations: Integer = null\nstep: Double = 0.0\nconvergenceTolerance: Double = 0.0\nseed: Long = 0\nlayers: IndexedSeq[String] = null\nintLayers: Array[Int] = null\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 
4 more fields]\nmlp: org.apache.spark.ml.classification.MultilayerPerceptronClassifier = mlpc_b8f00e24ae2a\npipeline: org.apache.spark.ml.Pipeline = pipeline_fe89d6ded7ee\nevaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_a03d68fe4f4d\nparamGrid: Array[org.apache.spark.ml.param.ParamMap] =\nArray({\n\n})\nnow: java.util.Date = Tue Aug 28 15:34:51 CEST 2018\nintFormat: java.text.SimpleDateFormat = java.text.SimpleDateFormat@b3d52dbf\ndateString: String = 20180828_033451\nevalMethod: String = crossValidation\ntrain: Double = 0.0\ntest: Double = 0.0\nxCrossValidation: Int = 0\n"}]},"apps":[],"jobName":"paragraph_1532679401527_-448910952","id":"20180314-121854_1204567882","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-28T15:34:46+0200","dateFinished":"2018-08-28T15:35:17+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:232"},{"title":"Naïve Bayes (NB)","text":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\n\nvar labelCol: Integer = _\nvar smoothing: Double = _\nvar datasetPath: String = _\nvar datasetSeparator: String = _\nvar datasetOutputPath: String = _\n\ntry {\n labelCol = z.get(\"labelCol\").asInstanceOf[String].toInt\n smoothing = z.get(\"smoothing\").asInstanceOf[String].toDouble\n datasetPath = z.get(\"datasetPath\").asInstanceOf[String]\n datasetSeparator = z.get(\"datasetSeparator\").asInstanceOf[String]\n datasetOutputPath = z.get(\"datasetOutputPath\").asInstanceOf[String]\n} catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n}\n\n\n// Get column with labels\nvar labelColReal=labelCol-1\nvar labelCol_str = \"_c\"+labelColReal.toString\n\n// Read Dataset from path\nvar df = spark.read\n .option(\"sep\", datasetSeparator)\n .option(\"inferSchema\", \"true\")\n .option(\"header\", \"false\")\n .csv(datasetPath)\n\n// Assign df with features\nvar df_features = df\n\n\n// If there is a column with labels, drop it\nif (labelColReal>=0){\n df_features = df.drop(labelCol_str)\n}\n\nval assembler1 = new VectorAssembler()\n .setInputCols(df_features.columns)\n .setOutputCol(\"features\")\n .transform(df)\n \n\n\n// Create a NaiveBayes model.\nval model = new NaiveBayes()\n .setFeaturesCol(\"features\")\n .setLabelCol(labelCol_str)\n .setSmoothing(smoothing)\n\n\n\n\n// Select (prediction, true label) and compute test error\n// val evaluator = new MulticlassClassificationEvaluator()\n// .setLabelCol(labelCol_str)\n// .setPredictionCol(\"prediction\")\n// .setMetricName(\"accuracy\")\n// val accuracy = evaluator.evaluate(predictions)\n// println(\"Test set accuracy = \" + accuracy)\n\n// Setup evaluator\nval pipeline = new Pipeline().setStages(Array(model)) \nval evaluator = new MulticlassClassificationEvaluator()\n .setLabelCol(labelCol_str)\n .setPredictionCol(\"prediction\")\n .setMetricName(\"accuracy\")\nval paramGrid = new ParamGridBuilder().build() // No parameter search\n\nval now = Calendar.getInstance().getTime()\nval intFormat = new SimpleDateFormat(\"yyyyMMdd_HHmmss\") // HH = 24-hour clock, so output names sort chronologically\nval dateString = intFormat.format(now)\n \n// Get evaluation method\nval evalMethod = z.get(\"evalMethod\").asInstanceOf[String]\n
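// Note (an assumption, not part of the original notebook): Spark ML's NaiveBayes also\n// exposes setModelType(\"multinomial\") or setModelType(\"bernoulli\"); the default,\n// multinomial, matches the algorithm description shown in the UI, so it is not set above.\nvar 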
train: Double = _\nvar test: Double = _\nvar xCrossValidation: Int = _\n\nif (evalMethod == \"trainSplit\") {\n \n try {\n train = z.get(\"train\").asInstanceOf[String].toDouble / 100\n test = z.get(\"test\").asInstanceOf[String].toDouble / 100\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n try {\n var Array(trainingData, testData) = assembler1.randomSplit(Array(train, test))\n // Fit the model on the training split only, so the test split stays unseen\n val nbModel = model.fit(trainingData)\n //Evaluation\n val predictions = nbModel.transform(testData)\n // Select (prediction, true label) and compute test error.\n \n val accuracy = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", \"probability\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"naiveBayes_tt_predict_\" + dateString + \".csv\")\n nbModel.save(datasetOutputPath + \"naiveBayes_tt_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n\n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Accuracy = \" + (100*accuracy)+\"%\")\n z.angularBind(\"result2\", \"Test Error = \" + (1.0 - accuracy))\n \n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", \"probability\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n \n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n} else {\n try {\n xCrossValidation = z.get(\"xCrossValidation\").asInstanceOf[String].toInt\n // We use a ParamGridBuilder to construct a grid of parameters to search over.\n // With e.g. 3 values for one parameter and 2 values for another, the grid\n // would have 3 x 2 = 6 parameter settings for CrossValidator to choose from:\n // val paramGrid = new ParamGridBuilder()\n // .addGrid(model.smoothing, Array(0.5, 1.0, 2.0))\n // .addGrid(model.modelType, Array(\"multinomial\", \"bernoulli\"))\n // .build()\n // Fit the model via cross-validation\n val cv = new CrossValidator()\n .setEstimator(pipeline)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setNumFolds(xCrossValidation) \n \n val cvModel = cv.fit(assembler1) \n // Extract the best fitted model from the cross-validated pipeline\n val nbModel: NaiveBayesModel = cvModel\n .bestModel.asInstanceOf[PipelineModel]\n .stages\n .last.asInstanceOf[NaiveBayesModel]\n val avgResult = cvModel.avgMetrics(0)\n // Select (prediction, true label) and compute accuracy on a held-out split.\n var Array(trainingData, testData) = assembler1.randomSplit(Array(0.8, 0.2), seed = 12345)\n val predictions = nbModel.transform(testData)\n val accuracy = evaluator.evaluate(predictions)\n \n try {\n predictions.select (\"features\", \"prediction\", \"probability\", labelCol_str).rdd.map(_.toString().replace(\"[\",\"\").replace(\"]\", \"\")).saveAsTextFile(datasetOutputPath + \"naiveBayes_cv_predict_\" + dateString + \".csv\")\n cvModel.save(datasetOutputPath + \"naiveBayes_cv_model_\" + dateString + \".csv\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", \"Output path \" + datasetOutputPath + \" does not exist or you don't have permission\")\n }\n }\n \n z.angularBind(\"result\", true)\n z.angularBind(\"result1\", \"Avg. Accuracy = \" + (100*avgResult)+\"%\")\n z.angularBind(\"result2\", \"Avg. 
Test Error = \" + (1.0 - avgResult))\n //println(avgResult)\n z.put(\"datasetResult\", predictions.select (\"features\", \"prediction\", \"probability\", labelCol_str))\n z.put(\"displayDataframe\", true)\n z.run(\"20180327-125756_1304518740\")\n } catch {\n case e: Exception => {\n z.angularBind(\"error\", true)\n z.angularBind(\"errorMessage\", e.getMessage)\n }\n }\n \n \n}","user":"anonymous","dateUpdated":"2018-08-28T15:27:27+0200","config":{"tableHide":true,"editorSetting":{"language":"scala"},"colWidth":12,"editorMode":"ace/mode/scala","editorHide":false,"fontSize":9,"title":true,"results":{},"enabled":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"import java.util.Calendar\nimport java.text.SimpleDateFormat\nimport org.apache.spark.ml.Pipeline\nimport org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder}\nlabelCol: Integer = null\nsmoothing: Double = 0.0\ndatasetPath: String = null\ndatasetSeparator: String = null\ndatasetOutputPath: String = null\nlabelColReal: Int = 4\nlabelCol_str: String = _c4\ndf: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\ndf_features: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 3 more fields]\nassembler1: org.apache.spark.sql.DataFrame = [_c0: double, _c1: double ... 4 more fields]\nmodel: org.apache.spark.ml.classification.NaiveBayes = nb_e346a7e6ef54\npipeline: org.apache.spark.ml.Pipeline = pipeline_8c1531c43eb4\nevaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_7f41db85afe1\nparamGrid: Array[org.apache.spark.ml.param.ParamMap] =\nArray({\n\n})\nnow: java.util.Date = Tue Aug 28 15:20:23 CEST 2018\nintFormat: java.text.SimpleDateFormat = java.text.SimpleDateFormat@b3d52dbf\ndateString: String = 20180828_032023\nevalMethod: String = crossValidation\ntrain: Double = 0.0\ntest: Double = 0.0\nxCrossValidation: Int = 0\n"}]},"apps":[],"jobName":"paragraph_1532679401527_-448910952","id":"20180314-121854_1598665997","dateCreated":"2018-07-27T10:16:41+0200","dateStarted":"2018-08-28T15:20:20+0200","dateFinished":"2018-08-28T15:20:35+0200","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:233"}],"name":"Aegis","id":"2DNTQAG7C","angularObjects":{"2CF9AMQ9Q::2DNTQAG7C":[],"2CHUQQW33::2DNTQAG7C":[],"2CEZ1XY26::2DNTQAG7C":[],"2CGD8H8TP::2DNTQAG7C":[],"2CRSX9NDY::2DNTQAG7C":[],"2CFZ6Q3A2::2DNTQAG7C":[{"name":"result","object":true,"noteId":"2DNTQAG7C"},{"name":"isDatasetResultPopulated","object":false,"noteId":"2DNTQAG7C"},{"name":"algorithmFamilies","object":[{"id":0,"name":"DIMENSIONALITY REDUCTION/FEATURE EXTRACTION/SELECTION","algorithms":[{"id":"pca","name":"PCA","information":"Principal Component Analysis (PCA) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a NEW set of values of linearly uncorrelated variables called principal components. Output is named pcaFeatures. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. 
output column, if any)","value":"-1"},{"name":"numTopFeatures","kind":"Integer","information":"Number of top features","value":"1"}],"paragraphId":"20180314-100351_1767145110"},{"id":"chiSquared","name":"ChiSquared","information":"Chi-Squared feature selection operates on labeled data with categorical features. It uses the Chi-Squared test of independence to decide which features to choose. Output is named “chiFeatures”. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"numTopFeatures","kind":"Integer","information":"Number of top features","value":"1"}],"paragraphId":"20180314-113514_1190037522"}]},{"id":1,"name":"NLP FUNCTIONS","algorithms":[{"id":"tokenizer","name":"Tokenizer","information":"Tokenization is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). Here we use RegexTokenizer that converts the input string to lowercase, removes stopwords and then splits it by white spaces. Output is named 'tokensOut'. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"}],"paragraphId":"20180314-113929_1964205303"},{"id":"nGram","name":"n-gram","information":"An n-gram is a sequence of n tokens (typically words) for some integer n. This function can be used to transform input features into n-grams, taking as input a sequence of strings (e.g. the output of the Tokenizer) and the output will consist of a sequence of n-grams where each n-gram is represented by a space-delimited string of n consecutive words. Output is named “ngramsOut”. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"numOfTerms","kind":"Integer","information":"Number of Terms","value":"2"}],"paragraphId":"20180314-114545_1909431260"},{"id":"tfIdf","name":"TF-IDF","information":"Term frequency-inverse document frequency (TF-IDF) is a feature vectorization method widely used in text mining to reflect the importance of a term to a document in a corpus. Output is named “tfidfOut”. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-features.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"numOfTopFeatures","kind":"Integer","information":"Number of Top features","value":"2"},{"name":"minDocFrequency","kind":"Integer","information":"minDocFrequency","value":"0"}],"paragraphId":"20180314-114722_563562800"}]},{"id":2,"name":"RECOMMENDERS","algorithms":[{"id":"als","name":"Collaborative Filtering (ALS)","information":"Collaborative Filtering produces recommendations based on what similar users like and aims to fill in the missing entries of a user-item- rating association matrix. Alternating Least Squares (ALS) matrix factorization is commonly used as a collaborative filtering algorithm. 
ALS models the rating matrix (R) as the multiplication of low-rank user (U) and product (V) factors and learns these factors by minimizing the reconstruction error of the observed ratings in an iterative procedure. Input data should contain 3 columns: userCol (user ids), itemCol (item ids), ratingCol (rating). <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"Rank","kind":"Integer","information":"Rank","value":"10"},{"name":"maxIterations","kind":"Integer","information":"Max Iterations","value":"10"},{"name":"regularizationParam","kind":"Float","information":"regularizationParam","value":"0.0"}],"paragraphId":"20180314-114730_1399433199"}]},{"id":3,"name":"CLUSTERING","algorithms":[{"id":"kMeans","name":"k-means","information":"k-means is one of the most commonly used clustering algorithms; it partitions the data points into a predefined number of clusters. The algorithm generates a model. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-clustering.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"numberOfClusters","kind":"Integer","information":"Number of clusters","value":"2"},{"name":"maxIterations","kind":"Integer","information":"Maximum Iterations","value":"20"}],"paragraphId":"20180314-120138_1441676951"},{"id":"gaussianMixtures","name":"Gaussian Mixtures","information":"A Gaussian Mixture Model represents a composite distribution whereby points are drawn from one of k Gaussian sub-distributions, each with its own probability. This implementation uses the expectation-maximization algorithm to induce the maximum-likelihood model given a set of samples. The algorithm generates a model, with the predicted cluster center and probability of each cluster. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-clustering.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"numberOfClusters","kind":"Integer","information":"Number of clusters","value":"2"},{"name":"maxIterations","kind":"Integer","information":"Maximum Iterations","value":"20"}],"paragraphId":"20180314-120159_2006536130"}]},{"id":4,"name":"CLASSIFICATION/REGRESSION","algorithms":[{"id":"linearRegression","name":"Linear regression (OLS)","information":"Ordinary Least Squares (OLS) is the simplest and most common linear regressor. The learning objective of OLS is to minimize the sum of squared residuals, in order to estimate the coefficients of the linear regression expression. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. 
output column, if any)","value":"-1"},{"name":"maxIterations","kind":"Integer","information":"Max Iterations","value":"10"},{"name":"regularizationParam","kind":"Float","information":"regularizationParam","value":"0.3"}],"paragraphId":"20180314-121848_2079236016"},{"id":"glm","name":"Generalized Linear Models (GLM)","information":"Contrasted with linear regression where the output is assumed to follow a Gaussian distribution, generalized linear models (GLM) are specifications of linear models where the response variable follows some distribution from the exponential family of distributions. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"maxIterations","kind":"Integer","information":"Max Iterations","value":"10"},{"name":"regularizationParam","kind":"Float","information":"regularizationParam","value":"0.3"},{"name":"distFamily","kind":"String","information":"Select Distribution Family (poisson, gaussian, binomial, gamma, tweedie)","value":"poisson"}],"paragraphId":"20180327-125233_1985102579"},{"id":"decisionTreesRegression","name":"Decision trees Regression (DTR)","information":"The decision tree is a greedy algorithm that performs a recursive binary partitioning of the feature space. Each partition is chosen greedily by selecting the best split from a set of possible splits, in order to maximize the information gain at a tree node. The impurity method used is the ‘variance’. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"maxDepth","kind":"Integer","information":"Maximum Depth","value":"5"},{"name":"maxBins","kind":"Integer","information":"maxBins","value":"32"},{"name":"minInstancesPerNode","kind":"Integer","information":"minInstancesPerNode","value":"1"},{"name":"minInfoGain","kind":"Float","information":"minInfoGain","value":"0.0"}],"paragraphId":"20180314-121851_102488565"},{"id":"decisionTreesClassifier","name":"Decision trees Classifier (DTC)","information":"The decision tree is a greedy algorithm that performs a recursive binary partitioning of the feature space. Each partition is chosen greedily by selecting the best split from a set of possible splits, in order to maximize the information gain at a tree node. The impurity method used is the ‘entropy’. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. 
output column, if any)","value":"-1"},{"name":"maxDepth","kind":"Integer","information":"Maximum Depth","value":"5"},{"name":"maxBins","kind":"Integer","information":"maxBins","value":"32"},{"name":"minInstancesPerNode","kind":"Integer","information":"minInstancesPerNode","value":"1"},{"name":"minInfoGain","kind":"Float","information":"minInfoGain","value":"0.0"}],"paragraphId":"20180314-121852_1052604787"},{"id":"randomForestRegression","name":"Random Forest Regressor (RFR)","information":"Random forests construct a group of decision trees at training time and use the mean outcome as the product of the system. This method overcomes the overfitting issue of individual decision trees. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"maxDepth","kind":"Integer","information":"Maximum Depth","value":"5"},{"name":"maxBins","kind":"Integer","information":"maxBins","value":"32"},{"name":"minInstancesPerNode","kind":"Integer","information":"minInstancesPerNode","value":"1"},{"name":"minInfoGain","kind":"Float","information":"minInfoGain","value":"0.0"},{"name":"numTrees","kind":"Integer","information":"number of Trees","value":"10"}],"paragraphId":"20180828-122052_380805630"},{"id":"randomForestClassifier","name":"Random Forest Classifier (RFC)","information":"Random forests construct a group of decision trees at training time and use the mean outcome as the product of the system. This method overcomes the overfitting issue of individual decision trees. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"maxDepth","kind":"Integer","information":"Maximum Depth","value":"5"},{"name":"maxBins","kind":"Integer","information":"maxBins","value":"32"},{"name":"minInstancesPerNode","kind":"Integer","information":"minInstancesPerNode","value":"1"},{"name":"minInfoGain","kind":"Float","information":"minInfoGain","value":"0.0"},{"name":"numTrees","kind":"Integer","information":"number of Trees","value":"10"}],"paragraphId":"20180828-131249_1439234059"},{"id":"gradientBoostedTreeRegression","name":"Gradient-boosted tree Regression (GBTR)","information":"Another popular ensemble of decision trees. It builds the model in a stage-wise fashion like other boosting methods do, and it generalizes them by allowing optimization of an arbitrary differentiable loss function. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. 
output column, if any)","value":"-1"},{"name":"maxDepth","kind":"Integer","information":"Maximum Depth","value":"5"},{"name":"maxBins","kind":"Integer","information":"maxBins","value":"32"},{"name":"minInstancesPerNode","kind":"Integer","information":"minInstancesPerNode","value":"1"},{"name":"minInfoGain","kind":"Float","information":"minInfoGain","value":"0.0"}],"paragraphId":"20180828-125333_2013070982"},{"id":"gradientBoostedTreeClassifier","name":"Gradient-boosted tree Classifier (GBTC)","information":"Another popular ensemble of decision trees. It builds the model in a stage-wise fashion like other boosting methods do, and it generalizes them by allowing optimization of an arbitrary differentiable loss function (note: BINARY classification only available). Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"maxDepth","kind":"Integer","information":"Maximum Depth","value":"5"},{"name":"maxBins","kind":"Integer","information":"maxBins","value":"32"},{"name":"minInstancesPerNode","kind":"Integer","information":"minInstancesPerNode","value":"1"},{"name":"minInfoGain","kind":"Float","information":"minInfoGain","value":"0.0"}],"paragraphId":"20180828-131253_989652806"},{"id":"mlp","name":"Multi-layer Perceptron (MLP)","information":"Multilayer perceptron (MLP) classifier is based on the feedforward artificial neural network. MLP classifier consists of multiple layers of nodes fully interconnected with each other. Nodes in the input layer represent the input data. All other nodes map inputs to outputs by a linear combination of the inputs with the node’s weights and bias. Each layer has sigmoid activation function, output layer has softmax. Number of inputs has to be equal to the size of feature vectors. Number of outputs has to be equal to the total number of labels. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"maxIterations","kind":"Integer","information":"Max Iterations","value":"100"},{"name":"step","kind":"Float","information":"step","value":"0.1"},{"name":"convergenceTolerance","kind":"Float","information":"convergenceTolerance","value":"0.001"},{"name":"layers","kind":"Comma separated Integers","information":"example 33,72,25","value":""},{"name":"seed","kind":"Integer","information":"seed","value":""}],"paragraphId":"20180314-121854_1204567882"},{"id":"nb","name":"Naïve Bayes (NB)","information":"Naïve Bayes (NB) is a simple multiclass classification algorithm based on applying Bayes’ theorem with strong (naive) independence assumptions between the features. This version supports multinomial NB. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. 
<a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"-1"},{"name":"smoothing","kind":"Float","information":"smoothing","value":"1.0"}],"paragraphId":"20180314-121854_1598665997"}]}],"noteId":"2DNTQAG7C"},{"name":"errorMessage","object":"Job aborted due to stage failure: Task 0 in stage 785.0 failed 4 times, most recent failure: Lost task 0.3 in stage 785.0 (TID 800, bbc7-10G, executor 1): java.lang.IllegalArgumentException: requirement failed: GBTClassifier was given dataset with invalid label 2.0. Labels must be in {0,1}; note that GBTClassifier currently only supports binary classification.\n\tat scala.Predef$.require(Predef.scala:224)\n\tat org.apache.spark.ml.classification.GBTClassifier$$anonfun$1.apply(GBTClassifier.scala:153)\n\tat org.apache.spark.ml.classification.GBTClassifier$$anonfun$1.apply(GBTClassifier.scala:151)\n\tat scala.collection.Iterator$$anon$11.next(Iterator.scala:409)\n\tat scala.collection.Iterator$$anon$11.next(Iterator.scala:409)\n\tat org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)\n\tat org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)\n\tat org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)\n\tat org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)\n\tat org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)\n\tat org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)\n\tat org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:285)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:287)\n\tat org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)\n\tat org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)\n\tat org.apache.spark.rdd.RDD.iterator(RDD.scala:287)\n\tat org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)\n\tat org.apache.spark.scheduler.Task.run(Task.scala:108)\n\tat org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\n\nDriver stacktrace:","noteId":"2DNTQAG7C"},{"name":"evaluationMethodFamily","object":{"methods":[{"name":"Split data by train-test ratio","value":"trainSplit","parameters":[{"name":"train","kind":"","information":"Train (%)","value":"50"},{"name":"test","kind":"","information":"Test (%)","value":"50"}]},{"name":"Split data by Cross Validation","value":"crossValidation","parameters":[{"name":"xCrossValidation","kind":"","information":"Set x for x-fold cross validation","value":"-1"}]}]},"noteId":"2DNTQAG7C"},{"name":"error","object":false,"noteId":"2DNTQAG7C"},{"name":"result2","object":"Root-mean-square-error (RMSE) = 1.2383282596429254E8","noteId":"2DNTQAG7C"},{"name":"arrayObjects","object":[],"noteId":"2DNTQAG7C"},{"name":"result1","object":"Coefficients: [2539.2670351519914], Intercept: 
1.8575234235674053E7","noteId":"2DNTQAG7C"},{"name":"selectedAlgorithm","object":{"id":"linearRegression","name":"Linear regression (OLS)","information":"Ordinary Least squares (OLS) is the simplest and most common linear regressor. The learning objective of OLS is to minimize the sum of squared residuals, in order to estimate the coefficients of the linear regression expression. Executing the algorithm produces a result dataframe which is saved, along with the respective model, to the designated folder. <a target=\"_blank\" href=\"https://spark.apache.org/docs/2.2.0/ml-classification-regression.html\">More…</a>","parameters":[{"name":"labelCol","kind":"Integer","information":"Label Column (Index of Label column, a.k.a. output column, if any)","value":"2","$$hashKey":"object:2916"},{"name":"maxIterations","kind":"Integer","information":"Max Iterations","value":"10","$$hashKey":"object:2917"},{"name":"regularizationParam","kind":"Float","information":"regularizationParam","value":"0.3","$$hashKey":"object:2918"}],"paragraphId":"20180314-121848_2079236016","$$hashKey":"object:2894"},"noteId":"2DNTQAG7C","paragraphId":"20180221-124127_25791550"},{"name":"datasetSeparator","object":",","noteId":"2DNTQAG7C","paragraphId":"20180221-124127_25791550"},{"name":"datasetPath","object":"hdfs:///Projects/suite5/demodata/test111.csv","noteId":"2DNTQAG7C","paragraphId":"20180221-124127_25791550"},{"name":"selectedEvalMeth","object":{"name":"Split data by train-test ratio","value":"trainSplit","parameters":[{"name":"train","kind":"","information":"Train (%)","value":"80","$$hashKey":"object:2949"},{"name":"test","kind":"","information":"Test (%)","value":"20","$$hashKey":"object:2950"}],"$$hashKey":"object:2301"},"noteId":"2DNTQAG7C","paragraphId":"20180221-124127_25791550"},{"name":"datasetOutputPath","object":"hdfs:///Projects/suite5/demodata/","noteId":"2DNTQAG7C","paragraphId":"20180221-124127_25791550"}]},"config":{"looknfeel":"default","personalizedMode":"false"},"info":{}}