資源簡介
資源包含以Java實現的C4.5決策樹代碼,以及測試數據。實現了C4.5的絕大部分功能,但對連續變量和缺失變量的處理並未詳加討論;不過相關的函數已經提供,讀者可以很容易藉此實現。

代碼片段和文件信息
import?java.io.BufferedReader;
import?java.io.File;
import?java.io.FileReader;
import?java.io.IOException;
import?java.util.ArrayList;
import?java.util.linkedList;
import?java.util.Map;
import?java.util.regex.Matcher;
import?java.util.regex.Pattern;
import?java.io.FileOutputStream;
import?java.io.BufferedOutputStream;
import?java.lang.Math.*;
public?class?DecisionTree?{
????// Keep both the training-set and the test-set data in the model, in case the two sets order their columns differently.
    // NOTE(review): this listing is extraction-damaged: spaces appear as '?', and the generic type
    // arguments on the declarations below seem to have been stripped (e.g. "ArrayList>").
    // Confirm the element types against the real src/DecisionTree.java.
????private?ArrayList?train_AttributeName?=?new?ArrayList();?// Names of the training-set attributes
????private?ArrayList>?train_attributeValue?=?new?ArrayList>();?// Values taken by each training-set attribute
????private?ArrayList?predictAttribute?=?new?ArrayList();?// Names of the test-set attributes
????private?ArrayList>?predict_attributeValue?=?new?ArrayList>();?// Values taken by each test-set attribute
????private?ArrayList?trainData?=?new?ArrayList();?// Training-set data, i.e. the data strings from the ARFF file
????private?ArrayList?predictData?=?new?ArrayList();?// Test-set data
????public?static?final?String?patternString?=?“@attribute(.*)[{](.*?)[}]“;
????// Regular expression; the lazy quantifier "*?" repeats any number of times but as few as possible, so the match does not run on to a later "}" character.
????private?int?decatt;?// Index of the decision variable within the attribute set (i.e. the class-label column)
????private?InfoGain?infoGain;
????private?TreeNode?root;
????public?void?train(String?data_path?String?targetAttr){
        // Trains the model: loads the training file, resolves the target attribute, builds the
        // tree into `root`, then prunes it.
        //   data_path  - path of the training file handed to read_trainARFF
        //   targetAttr - value handed to setDec; presumably the name of the class-label attribute (confirm)
        // NOTE(review): this listing is extraction-damaged: spaces appear as '?' and the commas between
        // arguments are missing. Confirm every statement against the real src/DecisionTree.java.
????????// Model initialization
????????read_trainARFF(new?File(data_path));
????????//printData();
????????setDec(targetAttr);
????????infoGain=new?InfoGain(trainData?decatt);
????????// Assemble the row and column index collections
????????linkedList?ll=new?linkedList();?// a linked list handles insertions and removals better than an ArrayList
        // NOTE(review): the loop header below is truncated in this listing (bound and increment are
        // missing); presumably it walks the attribute indices. The body adds every index except decatt to ll.
????????for(int?i?=?0;?i????????????if(i!=decatt)?ll.add(i);??// skip the class column so that a class variable which is not the last column does not cause errors
????????}
????????ArrayList?al=new?ArrayList();
        // NOTE(review): this loop header is truncated as well; presumably it walks the training-row
        // indices. The body adds each index to al.
????????for(int?i=0;i ????????????al.add(i);
????????}
????????// Build the decision tree
????????root?=?buildDT(“root“?“null“?al?ll);
????????// Prune
????????cutBranch(root);
????}
????/**
?????*?構建決策樹
?????*?@param?fatherName?節點名稱
?????*?@param?fatherValue?節點值
?????*?@param?subset?數據行子集
?????*?@param?selatt?數據列子集
?????*?@return?返回根節點
?????*/
????public?TreeNode?buildDT(String?fatherName?String?fatherValue?ArrayList?subsetlinkedList?selatt){
????????TreeNode?node=new?TreeNode();
????????Map?targetNum?=?infoGain.get_AttributeNum(subsetdecatt);//計算類-頻率
????????String?targetValue=infoGain.get_targetValue(targetNum);//判定分類
????????node.setTargetNum(targetNum);
????????node.setAttributeName(fatherName);
????????node.setAttributeValue(fatherValue);
????????node.setTargetValue(targetValue);
????????//終止條件為類標單一/樹深度達到特征長度(還有可能是信息增益率不存在)
????????if?(infoGain.isPure(targetNum)?|?selatt.isEmpty()?)?{
????????????node.setNodeType(“leafNode“);
????????????return?node;
????????}
????????int?maxIndex?=?infoGain.getGainRatioMax(subsetselatt);
????????selatt.remove(new?Integer(maxIndex));??//這樣可以remove?object
???
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件?????????366??2018-02-27?14:10??files\Tree.xml
?????文件?????????599??2018-02-24?16:02??files\train.arff
?????目錄???????????0??2018-02-27?14:42??src\
?????文件???????11311??2018-02-27?14:41??src\DecisionTree.java
?????文件????????4689??2018-02-27?14:18??src\InfoGain.java
?????文件????????1806??2018-02-24?15:48??src\MathUtils.java
?????文件????????1793??2018-02-27?14:42??src\TreeNode.java
?????目錄???????????0??2018-02-27?14:10??files\
- 上一篇:用Jena解析owl
- 下一篇:ID3決策樹含預測函數
評論
共有 條評論