# 通用文件处理 (generic data-file loading):
import numpy as np


def loadDataSet(fileName, dotSplit):
    """Load a delimited numeric text file into a feature matrix and a target column.

    By default the last column of the data set is treated as the dependent
    variable; every column before it is a feature.  The number of columns is
    taken from the first non-blank line of the file.

    Args:
        fileName: Path of the text file to read.
        dotSplit: Separator between the fields of each line, passed straight
            to ``str.split`` (so ``None`` means "any run of whitespace").

    Returns:
        A tuple ``(xMat, yMat)`` of ``np.matrix`` objects: ``xMat`` holds one
        row per data line and one column per feature, ``yMat`` is the column
        of target values.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
        IndexError: If a line has fewer fields than the first data line.
    """
    # Read the file once inside a context manager so the handle is always
    # closed.  Blank lines carry no record and would only make float() fail.
    with open(fileName) as fr:
        rows = [line.split(dotSplit) for line in fr if line.strip()]

    numFeat = len(rows[0]) if rows else 0

    dataMat = [[float(value) for value in row[:numFeat - 1]] for row in rows]
    labelMat = [float(row[numFeat - 1]) for row in rows]

    # np.asmatrix is the spelling that survives NumPy 2.0 (np.mat was removed).
    xMat = np.asmatrix(dataMat)
    yMat = np.asmatrix(labelMat).T
    return xMat, yMat
# 岭回归的实现 (ridge regression implementation):
import numpy as np


def ridgeRegres(xMat, yMat, lam=0.2):
    """Solve the ridge-regression normal equations for one penalty value.

    Evaluates ``(X^T X + lam * I)^-1 * X^T y``.

    Args:
        xMat: ``np.matrix`` of features, one sample per row.  A matrix (not a
            plain ndarray) is required because ``*`` is used as the matrix
            product and ``.I`` as the inverse.
        yMat: ``np.matrix`` of targets with one row per sample.
        lam: Ridge penalty added to the diagonal of ``X^T X``.

    Returns:
        The weight matrix with one row per feature, or ``None`` when the
        determinant of the penalised matrix is exactly zero (in which case
        ``wrong`` is printed).
    """
    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    # Diagnostic output: the number of samples being fitted.
    print(np.shape(xMat)[0])
    if np.linalg.det(denom) == 0.0:
        print("wrong")
        return None
    ws = denom.I * (xMat.T * yMat)
    return ws


def normalizing(xMat, yMat):
    """Centre the targets and centre/scale the features column by column.

    Note that the features are divided by their *variance* (not their
    standard deviation).  Columns whose variance is zero are only centred,
    so a constant feature yields zeros instead of NaN/inf.

    Args:
        xMat: Feature matrix, one sample per row.
        yMat: Target matrix, one row per sample.

    Returns:
        A tuple ``(x, y)`` with the scaled features and the centred targets.
    """
    y = yMat - np.mean(yMat, 0)
    xMeans = np.mean(xMat, 0)
    xVar = np.var(xMat, 0)
    # Work on a float copy so the zero-variance guard never touches xVar.
    xScale = np.array(xVar, dtype=float)
    xScale[xScale == 0] = 1.0
    x = (xMat - xMeans) / xScale
    return x, y


def ridgeTest(xM, yM, numTestPts=30):
    """Fit ridge regression over an exponential sweep of penalty values.

    The data are first passed through ``normalizing``; the i-th fit then
    uses ``lam = exp(i - 10)``.

    Args:
        xM: ``np.matrix`` of features, one sample per row.
        yM: ``np.matrix`` of targets, one row per sample.
        numTestPts: Number of penalty values to try (default 30).

    Returns:
        An ndarray of shape ``(numTestPts, n_features)``; row ``i`` holds the
        weights for ``lam = exp(i - 10)``.  A row is filled with NaN when
        ``ridgeRegres`` reports a singular system for that penalty.
    """
    xMat, yMat = normalizing(xM, yM)
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    # Diagnostic output: the freshly allocated weight table.
    print(wMat)
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        if ws is None:
            # No solution for this penalty: mark the row instead of failing
            # on ``None.T`` and losing the rest of the sweep.
            wMat[i, :] = np.nan
            continue
        wMat[i, :] = np.asarray(ws).ravel()
    return wMat
# 向前逐步回归 (forward stagewise regression):
import numpy as np


def rssError(yArr, yHatArr):
    """Return the residual sum of squares between observed and predicted values."""
    return ((yArr - yHatArr) ** 2).sum()


def stageWise(xM, yM, eps=0.01, numIt=100):
    """Greedy forward stagewise regression.

    Starting from all-zero weights, every iteration tries moving each single
    weight by ``+eps`` and ``-eps`` and keeps the candidate with the lowest
    residual sum of squares, provided it is strictly lower than the best
    error seen so far.  When no candidate improves on that error the weights
    are left unchanged for the iteration.

    Args:
        xM: 2-D feature matrix, one sample per row (``np.matrix`` or ndarray).
        yM: Targets, expected to be a column with one row per sample.
        eps: Step size applied to a single weight per candidate.
        numIt: Number of iterations to run.

    Returns:
        An ndarray of shape ``(numIt, n_features)`` whose row ``i`` holds the
        weights after iteration ``i``.
    """
    _, n = np.shape(xM)
    yObs = np.asarray(yM)
    returnMat = np.zeros((numIt, n))
    ws = np.zeros((n, 1))
    wsMax = ws.copy()
    # Baseline is the error of the starting weights.  Seeding it from a
    # candidate's error instead would waste the first pass and, when that
    # candidate is the best first move, block every later step because the
    # comparison below is strict.
    lowestError = rssError(yObs, np.asarray(np.dot(xM, ws)))
    for i in range(numIt):
        # Progress output: the weights at the start of this iteration.
        print(ws.T)
        for j in range(n):
            for sign in (-1, 1):
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = np.asarray(np.dot(xM, wsTest))
                rssE = rssError(yObs, yTest)
                if rssE < lowestError:
                    lowestError = rssE
                    print(lowestError)
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.ravel()
    return returnMat