diff --git a/a4.r b/a4.r
index a50a9d1120ea8f2863e9f08e94c530620692d8e2..f459ae41f3a6296e0a37a7adbfdf92c65d611abe 100644
--- a/a4.r
+++ b/a4.r
@@ -1,5 +1,9 @@
 ###
  # R code for assignment 4.
+ #
+ # The "rpart" library builds recursive-partitioning ("CART") decision trees. With a numeric response it fits a
+ # regression tree (predicting continuous values); with a factor response such as iris$Species it fits a
+ # classification tree (predicting class labels), which is what happens in this assignment.
  ##
@@ -8,8 +12,8 @@
 cat('\014')

 # Install and import packages.
-install.packages('rpart', dependencies=TRUE)
-install.packages('randomForest', dependencies=TRUE)
+# install.packages('rpart', dependencies=TRUE)
+# install.packages('randomForest', dependencies=TRUE)
 library(rpart)
 library (randomForest)
@@ -28,3 +32,149 @@ catn <- function(message, trailing='\n') {
     cat(trailing)
 }
+
+
+###
+ # View attributes and data from the "iris" dataset.
+ # The dataset is built into R (it ships with the base "datasets" package).
+ ##
+view_iris_dataset <- function() {
+
+    # General structure info.
+    catn('')
+    catn('str(iris):')
+    print(str(iris))
+    catn('')
+
+    # First 6 items in dataset.
+    catn('head(iris):')
+    print(head(iris))
+    catn('')
+
+    # Last 6 items in dataset.
+    catn('tail(iris):')
+    print(tail(iris))
+    catn('')
+
+    # Summary of dataset.
+    catn('summary(iris):')
+    print(summary(iris))
+    catn('')
+
+}
+
+
+###
+ # This seems to be the simplest possible code to create a decision tree and print the result as a table.
+ #
+ # The first line creates the tree. The first arg is the "formula" (Species~. means "predict Species from all of the
+ # other columns") and the second is the dataset to use.
+ # The second line draws the fitted tree and labels it on the active graphics device. When run non-interactively
+ # (e.g. via Rscript) the plot typically goes to a file such as Rplots.pdf rather than the console, which is why the
+ # console output looks unchanged if the line is commented out.
+ # The third line builds and prints the confusion table of predicted class vs. actual Species.
+ #
+ # Note that we essentially give a static dataset. Thus, our output is always the same, every time we run the code.
+ ##
+part_1_1 <- function() {
+    catn('')
+    catn('Part 1.1:')
+
+    # Provided assignment code.
+    dt=rpart(Species~.,iris)
+    plot(dt);text(dt);
+    print(table(predict(dt,type="class"),iris$Species))
+
+    catn('')
+}
+
+
+###
+ # Very similar to Part 1.1 code.
+ #
+ # The only difference is that the first line now provides an additional argument, which is the "control".
+ # According to the docs (see documents/references.md), this control value specifies "various parameters that control
+ # aspects of the rpart fit".
+ #
+ # In this case, we give "cp" and "minsplit" values of 0. According to the docs:
+ # * minsplit - The minimum number of observations that must exist in a node for a split to be attempted.
+ # * cp - The complexity parameter; a split that does not improve the fit by at least this factor is not attempted.
+ #
+ # Setting both to 0 therefore removes the usual stopping rules, so the tree keeps splitting until the training data
+ # is (almost) perfectly separated. Because the tree is grown and evaluated on the same full dataset, every row ends
+ # up classified correctly, which is why the table shows only the extremes of 0 and 50. A small sketch comparing tree
+ # sizes under both settings follows this function.
+ ##
+part_1_2 <- function() {
+    catn('')
+    catn('Part 1.2:')
+
+    # Provided assignment code.
+    dt=rpart(Species~.,iris,control=rpart.control(cp=0.0,minsplit=0))
+    plot(dt);text(dt);
+    print(table(predict(dt,type="class"),iris$Species))
+
+    catn('')
+}
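+
+
+###
+ # Illustrative sketch (not part of the provided assignment code).
+ #
+ # A quick way to see what cp/minsplit change is to compare the size of the tree grown with the default control
+ # (cp=0.01, minsplit=20) against the unconstrained tree from Part 1.2. The helper name and the output format below
+ # are assumptions made only for illustration; the sketch relies only on rpart(), rpart.control(), and the fitted
+ # tree's $frame component (one row per node, with "<leaf>" marking terminal nodes).
+ ##
+compare_tree_sizes <- function() {
+
+    # Default stopping rules vs. no stopping rules.
+    dt_default = rpart(Species~., iris)
+    dt_full = rpart(Species~., iris, control=rpart.control(cp=0.0, minsplit=0))
+
+    # Counting the "<leaf>" rows of $frame gives the number of terminal nodes in each tree.
+    cat('Leaves with default control:  ', sum(dt_default$frame$var == '<leaf>'), '\n')
+    cat('Leaves with cp=0, minsplit=0: ', sum(dt_full$frame$var == '<leaf>'), '\n')
+}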
+
+
+###
+ # This code is quite a bit different from the above two parts.
+ #
+ # The first three lines partition the dataset into two subsets: a training set and a testing set.
+ #
+ # More specifically, iris is ordered by species, with 50 rows per species. sample(50,30) picks 30 random row indexes
+ # out of the first block of 50, and the +50 and +100 offsets repeat that draw for the second and third species
+ # blocks. The combined 90 indexes select the training set, and the test set uses all rows whose indexes were not
+ # chosen (the negative-index syntax iris[-train_index,]), i.e. the remaining 20 rows per species. (A short snippet
+ # verifying these counts is sketched at the end of documents/references.md.)
+ #
+ # Finally, the remaining three lines of code are mostly the same as the code in Parts 1.1 and 1.2, with the main
+ # difference being that the tree is fit on the training subset and the table is built from predictions on the test
+ # subset. The final results vary on every run, because the sampled indexes vary on every run.
+ #
+ # Note that, because 30 rows per species go to the training set and 20 per species go to the testing set, the
+ # confusion table sums to 60 test rows (20 per species column).
+ ##
+part_1_3 <- function() {
+    catn('')
+    catn('Part 1.3:')
+
+    # Provided assignment code.
+    train_index = c(sample(50,30), sample(50,30)+50, sample(50,30)+100)
+    iris_train=iris[train_index,]
+    iris_test=iris[-train_index,]
+    dt=rpart(Species~.,iris_train)
+    plot(dt);text(dt);
+    print(table(predict(dt,newdata=iris_test,type="class"),iris_test$Species))
+
+    catn('')
+}
+
+
+###
+ # This section of code accomplishes something similar to the above parts, except it uses the "randomForest" library
+ # to do so.
+ #
+ # For the first line, we create our forest structure. The arguments are as follows:
+ # * arg1 - Once again, this is our "formula". Species~. means "predict Species from all of the other columns"; the
+ #   dot stands for every remaining variable in the dataset.
+ # * arg2 - The full iris dataset.
+ # * ntree - The number of trees to grow in our forest. Note that our output is somewhat random, but generally very
+ #   consistent with minimal variation, presumably because 1000 trees are being averaged over such a small dataset.
+ # * proximity - When TRUE, a proximity matrix is also computed: the proximity of two rows is the fraction of trees
+ #   in which they land in the same terminal node, which acts as a similarity measure between observations.
+ #
+ # The second line builds and displays a confusion table of the forest's predictions against the actual Species
+ # values (with no new data supplied, predict() returns the out-of-bag predictions, so the counts are close to, but
+ # not exactly, a perfect fit).
+ ##
+part_1_4 <- function() {
+    catn('')
+    catn('Part 1.4:')
+
+    # Provided assignment code.
+    rf=randomForest(Species~., iris, ntree=1000, proximity=TRUE)
+    print(table(predict(rf,type="class"),iris$Species))
+
+    catn('')
+}
+
+
+# View dataset.
+view_iris_dataset()
+
+# Run the code parts provided by the assignment description file.
+part_1_1()
+part_1_2()
+part_1_3()
+part_1_4()
diff --git a/documents/references.md b/documents/references.md
index 33b73923cc9ff19abe3892ef3f3297c6353eec32..b0821eb896664c9b36cfc63144f48c6a07188375 100644
--- a/documents/references.md
+++ b/documents/references.md
@@ -5,8 +5,24 @@
 All references to external logic.
 Includes anything from stack overflow links to notes about logic from previous works.

+## General
+
+### Decision Trees
+<https://en.wikipedia.org/wiki/Decision_tree_learning>
+
+
 ## R

+### Rpart Documentation
+<https://www.rdocumentation.org/packages/rpart/versions/4.1-15/topics/rpart>
+<https://www.rdocumentation.org/packages/rpart/versions/4.1-15/topics/rpart.control>
+
+### Sample Documentation
+<https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/sample>
+
+### Random Forest Documentation
+<https://www.rdocumentation.org/packages/randomForest/versions/4.6-14/topics/randomForest>
+
 ### Alternative to the "Print" Function
 Because it doesn't always behave how I expect, and tends to give additional, "dirty" output that I don't want.
 <https://stackoverflow.com/questions/11230957/alternative-to-print-and-cat>
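+
+
+### Note on the Part 1.3 Train/Test Split
+A small illustrative sketch (not part of the assignment code) of why the index arithmetic in a4.r Part 1.3 produces a
+stratified split. It assumes the standard ordering of the built-in iris dataset: rows 1-50 are setosa, 51-100 are
+versicolor, and 101-150 are virginica.
+
+```r
+# Draw 30 of the 50 row indexes from each species block; the +50 / +100 offsets move the draw into the
+# second and third blocks of 50 rows.
+train_index <- c(sample(50, 30), sample(50, 30) + 50, sample(50, 30) + 100)
+
+# 90 training rows and 60 test rows; each subset keeps the species balanced.
+table(iris[train_index, 'Species'])   # expected: 30 setosa, 30 versicolor, 30 virginica
+table(iris[-train_index, 'Species'])  # expected: 20 setosa, 20 versicolor, 20 virginica
+```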