http://blog.sina.com.cn/s/blog_6bc5205e0102vma9.html
install.packages
1、從網絡上直接安裝
install.packages("包名字",lib="安裝目錄",repos="包所在的網址")。也可通過參數contriburl指定包所在的網址
例:
install.packages("stepNorm",contriburl="http://www.your.url",dependencies=TRUE)
2、本地安裝
1)install.packages("包文件的完整路徑")
2)在命令行下(不是R窗口)直接輸入:R CMD INSTALL 包文件的完整路徑
installed.packages()# 查看已經安裝的包 配合install.packages() 寫一個循環可以批量安裝包?
update.packages()#更新已安裝的包
path.package() #查看已加載的包所在的路徑
.libPaths()#查看包的安裝目錄
.libPaths('youlibray')#修改包的安裝目錄
.libPaths(c('newlibray','oldlibray'))
library("your package", lib.loc="/yourlibrary/")#下載到臨時文件夾的調用方法
R.version 、R.version.string 查看當前版本
R幫助函數
help.start( )打開幫助文檔首頁
help()?
?
# Read only the first five rows of the table.
data <- read.table("datatable.txt", nrows = 5)
getwd()
setwd("D:\\RStudio\\www")
data2 <- read.csv("new.csv", encoding = "UTF-8")
# Show the first column.
data2[, 1]
# Turn the first column into a factor (categorical structure).
data2[, 1] <- factor(data2[, 1])
data2[, 1]
# Map the existing levels onto a new set of labels.
data2[, 1] <- factor(
  data2[, 1],
  labels = c("三年一班", "三年二班", "三年三班")
)
data2[, 1]
data_1 <- read.csv("new_1.csv", encoding = "utf8")
fix(data_1)
data_1[, 1] <- factor(
  data_1[, 1],
  levels = c(1, 2, 3),
  labels = c("三年一班", "三年二班", "三年三班")
)
data_1
# Convert the factor column to a vector of strings.
as.vector(data_1[, 1])
data_1
# Convert the factor column to a numeric vector.
as.numeric(data_1[, 1])
# Discretise a continuous variable (third column of data_1).
score <- data_1[, 3]
score1 <- cut(score, breaks = 3)
score1
# Cut into groups defined by explicit break points.
score2 <- cut(score, breaks = c(79, 100, 120, 161))
score2
# Build an ordered factor from the binned scores.
score3 <- ordered(score1, labels = c("bad", "ok", "good"))
score3
table(score3)
# Sortable (ordered) categorical structure.
datax <- read.csv("new.csv", encoding = "UTF-8")
# Pass levels/labels as named arguments with `=`. The original used
# `levels <- ...` / `labels <- ...` inside the call, which creates global
# variables as a side effect and only worked through positional matching.
datax[, 1] <- ordered(
  datax[, 1],
  levels = c(1, 3, 2),
  labels = c("一班", "三班", "二班")
)
table(datax[, 1])
# Sort the rows by the ordered class column, then by the third column.
datax[order(datax[, 1]), ]
datax[order(datax[, 3]), ]
# Creating lists.
# Without tags (unnamed elements).
j <- list("a", 500, TRUE)
# With tags (named elements).
y <- list(name = "fudegang", salary = 10000, union = TRUE)
# Accessing list elements.
# 1. list_name$tag_name
y$name
y$salary
# 2. list_name[[tag]]
y[["name"]]
# 3. list_name[[index]]
y[[1]]
# Single-bracket access returns a list; empty brackets return the whole list.
y[]
y[1]$name
# Inspect the tags.
labels(y)
labels(y[1])
# `[[ ]]` extracts exactly one element; `[ ]` with a range returns a sub-list.
# (The original subset the builtin `list` itself, which is an error.)
y[1:2]
# Modifying a list.
# Add elements: one by tag, one by position (left unnamed).
y$sex <- "男"
y[5] <- 170
y[3]
# Update an element.
y$sex <- "女"
y
# Delete an element by assigning NULL.
y$sex <- NULL
y
# Search: element-wise comparison against a value.
y == "fudegang"
# Number of elements.
length(y)
# Data frames.
# Define a data frame from two vectors.
name <- c("張三", "李四", "王五")
age <- c(23, 33, 56)
df <- data.frame(name, age)
df
# Rename all columns.
colnames(df)
names(df)
names(df) <- c("name2", "age2")
colnames(df)
# Rename one column, first by position and then by matching its current name.
names(df)[2] <- "age3"
df
names(df)[names(df) == "age3"] <- "age4"
df
# Change the row names.
row.names(df)
row.names(df) <- 0:2
df
# Drop rows with negative indices.
df1 <- df[-1, ]
df1
df2 <- df[-2, ]
df2
# Add a column (this assignment creates the new `sex` column).
df[, "sex"] <- c(0, 1, 1)
df
# 3.1 Data cleaning
# Remove duplicated rows with unique().
dd <- read.csv("1.csv", encoding = "UTF-8")
dd
new_dd <- unique(dd)
new_dd
# Missing values can be filled in, dropped, or left alone.
# Drop rows that contain NA with na.omit().
dd_1 <- read.csv("2.csv", encoding = "UTF-8")
dd_1
new_dd_1 <- na.omit(dd_1)
new_dd_1
# Clean whitespace with trim() from the raster package (as the original note
# describes it).
getwd()
setwd("D:\\RStudio\\www")
.libPaths()
# Install raster only when it is missing. The original reinstalled it twice on
# every run, once passing the whole .libPaths() vector as `lib`.
if (!requireNamespace("raster", quietly = TRUE)) {
  install.packages("raster")
}
library("raster")
path.package()
# Read the file once, after the working directory is set.
dd_3 <- read.csv("3.csv")
dd_3
new_dd_3 <- trim(dd_3)
View(new_dd_3)
# 3.2 Data extraction
# Field extraction by character position: substr(x, start, stop).
tel <- "13811568128"
band <- substring(tel, 1, 3)
band
area <- substring(tel, 4, 7)
area
num <- substring(tel, 8, 11)
num
# Apply the same three-way split to the first column of a CSV file.
getwd()
tels <- read.csv("1.csv")
fix(tels)
bands <- substr(tels[, 1], 1, 3)
bands
areas <- substr(tels[, 1], 4, 7)
areas
nums <- substr(tels[, 1], 8, 11)
nums
# Put the original data and the extracted pieces side by side.
num_tels <- data.frame(tels, bands, areas, nums)
fix(num_tels)
# Split a field with str_split_fixed(x, split, n), similar to Excel's
# text-to-columns feature.
.libPaths()
install.packages("stringr", lib = .libPaths())
library(stringr)
items <- read.csv("2.csv", encoding = "UTF-8", stringsAsFactors = FALSE)
fix(items)
# Split the first column on the first space into two pieces.
new_bands <- str_split_fixed(items[, 1], " ", n = 2)
new_bands
fix(new_bands)
# Keep the first piece next to the original data and rename the columns.
new_items <- data.frame(new_bands[, 1], items)
fix(new_items)
names(new_items) <- c("band", "item")
colnames(new_items)
fix(new_items)
# Row filtering with subset(x, condition), similar to Excel's filter feature.
getwd()
item3 <- read.table(
  "3.csv",
  header = TRUE,
  sep = "|",
  fileEncoding = "utf-8",
  stringsAsFactors = FALSE
)
fix(item3)
# Keep the rows whose `comments` value exceeds 100.
sub_item3 <- subset(item3, comments > 100)
fix(sub_item3)
# 3.3 Merging data
# Stack records with rbind(dataframe1, dataframe2, ...).
data1 <- read.table(
  "1_1.csv", sep = "|", header = TRUE,
  fileEncoding = "utf-8", stringsAsFactors = FALSE
)
data2 <- read.table(
  "1_2.csv", sep = "|", header = TRUE,
  fileEncoding = "utf-8", stringsAsFactors = FALSE
)
data3 <- read.table(
  "1_3.csv", sep = "|", header = TRUE,
  fileEncoding = "utf-8", stringsAsFactors = FALSE
)
datar <- rbind(data1, data2, data3)
fix(datar)
# Filter the stacked data: keep rows whose second column exceeds 10000.
datab <- subset(datar, datar[, 2] > 10000)
fix(datab)
# Concatenate fields: the first three columns are joined without a separator.
data4 <- read.table("2.csv", sep = " ")
fix(data4)
data5 <- paste0(data4[, 1], data4[, 2], data4[, 3])
new_data5 <- data.frame(data4, data5)
fix(new_data5)
# Field matching with merge(x, y, by.x, by.y), similar to Excel's VLOOKUP.
items6 <- read.table(
  "3_1.csv", sep = "|", header = FALSE, fileEncoding = "utf-8"
)
fix(items6)
price <- read.table(
  "3_2.csv", sep = "|", header = FALSE, fileEncoding = "utf-8"
)
colnames(items6)
# Two ways to name the key: a literal column name, or one looked up from
# names(). The second result overwrites the first.
itmesprice <- merge(price, items6, by.x = "V1", by.y = "V1")
itmesprice <- merge(items6, price, by.x = names(items6)[1], by.y = "V1")
fix(itmesprice)
# 3.4 Simple calculations
getwd()
dada <- read.csv(
  "1.csv",
  fileEncoding = "utf-8",
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = "|"
)
colnames(dada)
# Row-wise cost = price * num, appended as a new column.
cost <- dada$price * dada$num
new_dada <- data.frame(dada, cost)
fix(new_dada)
# Data standardisation; here this means 0-1 (min-max) scaling.
dada2 <- read.csv("2.csv", fileEncoding = "utf-8")
View(dada2)
colnames(dada2)
# (x - min) / (max - min) on the score column.
scale <- (dada2$score - min(dada2$score)) /
  (max(dada2$score) - min(dada2$score))
new_dada2 <- data.frame(dada2, scale)
fix(new_dada2)
# Grouping data into bins.
cc <- read.csv("3.csv", header = TRUE, sep = "|", fileEncoding = "utf-8")
options(digits = 15)
cc
fix(cc)
# Assign each cost to a bin label. The thresholds are tested in ascending
# order, so each branch only sees values above the previous threshold.
# (Stray `?` characters that broke the original nested call are removed.)
level <- ifelse(
  cc$cost <= 20, "(0,20)",
  ifelse(
    cc$cost <= 40, "(20,40)",
    ifelse(
      cc$cost <= 60, "(40,60)",
      ifelse(cc$cost <= 80, "(60,80)", "(80-以上")
    )
  )
)
level
cc1 <- data.frame(cc, level)
fix(cc1)
# 3.5 Date handling
# Parse date strings into POSIXlt using an explicit format.
strdata <- "2016-4-28"
posixlt <- as.POSIXlt(strdata, format = "%Y-%m-%d")
posixlt
strdata2 <- "2016/4/29"
posixlt <- as.POSIXlt(strdata2, format = "%Y/%m/%d")
posixlt
# Format the parsed date back into a string.
newstrdata <- format(posixlt, format = "%Y-%m-%d")
newstrdata
# Extract date parts from a column of date strings.
xxx <- read.csv("1.csv", header = TRUE)
fix(xxx)
pos <- as.POSIXlt(xxx$注冊時間, format = "%Y-%m-%d")
# The original stored the parsed dates in `pos` but then read an undefined
# `www`; bind both names to the same value so the rest of the section works.
www <- pos
fix(www)
# POSIXlt stores years since 1900 and zero-based months.
yeas <- www$year + 1900
mon <- www$mon + 1
newwww <- data.frame(www, yeas, mon)
View(newwww)
# 4.1 Data analysis
# Basic statistics: summary(), length, sum, mean, var, sd.
getwd()
.libPaths()
setwd("D:\\RStudio\\www")
getwd()
ali <- read.csv("1.csv", fileEncoding = "utf-8")
ali
summary(ali$score)
# Count
length(ali$score)
# Mean
mean(ali$score)
# Maximum
max(ali$score)
# Minimum
min(ali$score)
# Variance
var(ali$score)
# Standard deviation
sd(ali$score)
# Sum
sum(ali$score)
# 4.2 Grouped analysis with aggregate(statistic ~ group), comparable to an
# Excel pivot table: count, sum and mean per class.
aggregate(ali$name ~ ali$class, data = ali, FUN = length)
aggregate(ali$score ~ ali$class, data = ali, FUN = sum)
aggregate(ali$score ~ ali$class, data = ali, FUN = mean)
colnames(ali)
# 4.3 Cross tabulation (pivot table):
# tapply(statistic, list(row factor, column factor), FUN = summary function).
用戶明細 <- read.csv("用戶明細.csv", stringsAsFactors = FALSE)
fix(用戶明細)
# Age bands. Both tests read the loaded data frame (the original referenced an
# undefined `user`), and the second boundary is `< 30` so that age 30 falls
# into "30歲及以上" rather than "21歲至29歲".
年齡分組 <- ifelse(
  用戶明細$年齡 <= 20, "20歲及20歲以下",
  ifelse(用戶明細$年齡 < 30, "21歲至29歲", "30歲及以上")
)
colnames(用戶明細)
fix(年齡分組)
用戶明細 <- data.frame(用戶明細, 年齡分組)
fix(用戶明細)
# Count user IDs by age band (rows) and sex (columns).
tapply(
  用戶明細$用戶ID,
  list(用戶明細$年齡分組, 用戶明細$性別),
  FUN = length
)
# Structure analysis with prop.table().
getwd()
setwd("D:\\RStudio\\www")
bibi <- read.csv("5.csv", stringsAsFactors = FALSE, fileEncoding = "utf-8")
fix(bibi)
colnames(bibi)
# Count records per brand, then turn the counts into proportions.
bibi1 <- tapply(bibi$月消費.元., list(bibi$通信品牌), length)
bibi1
prop.table(bibi1)
# Province-by-brand counts, then proportions with margin = 1.
bibi1 <- tapply(
  bibi$月消費.元.,
  list(bibi$省份, bibi$通信品牌),
  length
)
bibi1
prop.table(bibi1, margin = 1)
# 5.1 Data visualisation
# Pie chart with pie().
bibi1 <- tapply(bibi$月消費.元., list(bibi$通信品牌), length)
bibi1
p <- prop.table(bibi1)
# Slice labels: brand name followed by its percentage share.
label <- paste0(names(p), round(p * 100, 2), "%")
pie(bibi1, labels = label, main = "通信品牌用戶結構圖")
# Scatter plot: plot(x, y, main, sub, xlab, ylab, col).
datam <- read.csv("data.csv", header = TRUE)
colnames(datam)
# Stray `?` characters inside the original call are removed so it parses.
plot(
  datam$廣告費用,
  datam$購買用戶數,
  main = "相關分析",
  sub = "廣告費用和用戶數之間的關系",
  xlab = "廣告費用",
  ylab = "購買用戶數",
  col = "red"
)
# Line chart: plot(x, y, main, sub, xlab, ylab, col, type).
ds <- as.POSIXlt(datam[, 1])
# POSIXlt stores years since 1900 and zero-based months.
year <- ds$year + 1900
moth <- ds$mon + 1
yearm <- paste(year, "年", moth, "月", sep = "")
# plot() cannot place character labels on a numeric axis, so draw against the
# observation index with type = "l" and put the year-month labels on the axis.
# NOTE(review): main/sub/xlab are carried over from the scatter plot and
# presumably should describe the time axis instead - confirm.
plot(
  seq_along(yearm),
  datam$購買用戶數,
  type = "l",
  xaxt = "n",
  main = "相關分析",
  sub = "廣告費用和用戶數之間的關系",
  xlab = "廣告費用",
  ylab = "購買用戶數",
  col = "red"
)
axis(1, at = seq_along(yearm), labels = yearm)
# Map drawing: map(database, fill = FALSE, col).
# Map annotation: text(x, y, text, cex), where cex scales the font size.
.libPaths("d:/R/R-3.2.3/library")
.libPaths()
# Install before loading, with quoted package names, and only when missing.
# The original called library() first and install.packages(maps) unquoted.
for (pkg in c("maps", "mapdata")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, lib = "d:/R/R-3.2.3/library")
  }
}
installed.packages()
path.package()
library(maps)
library(mapdata)
# Draw outline maps for the "state" and "china" databases.
m <- map("state", fill = FALSE)
m$names
c <- map("china", fill = FALSE)
# Prepare the map data.
# Install the required packages only when they are missing.
for (pkg in c("maptools", "ggplot2", "mapproj")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(maps)
library(mapdata)
library(maptools)
# Read the spatial polygon data from the shapefile.
china_map <- readShapePoly("bou2_4p.shp")
plot(china_map)
library(ggplot2)
# Draw the outline with ggplot.
ggplot(china_map, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", colour = "grey") +
  coord_map("polyconic")
# Administrative attributes, one row per shape.
x <- china_map@data
# Zero-based shape id (the original hard-coded 925 shapes, islands included);
# deriving it from nrow(x) gives the same ids for that shapefile.
xs <- data.frame(x, id = seq_len(nrow(x)) - 1)
# Convert the spatial object to a data frame.
china_map1 <- fortify(china_map)
library(plyr)
# Join the polygon coordinates with the attributes (the original notes that
# this joins by id).
china_map_data <- join(china_map1, xs, type = "full")
# Prepare the business data.
NAME <- unique(china_map@data$NAME)
# Read the province-level data.
mydata1 <- read.csv("www.csv")
mydata <- data.frame(NAME, mydata1)
# Random demo values used as the fill variable.
ccc <- runif(34, min = 1, max = 100)
mydata <- data.frame(mydata, ccc)
# Join the map data with the business data (the original notes that this
# joins by NAME).
china_data <- join(china_map_data, mydata, type = "full")
# Draw the filled (choropleth) map.
ggplot(china_data, aes(x = long, y = lat, group = group, fill = ccc)) +
  geom_polygon(colour = "grey40") +
  # Gradient fill colours.
  scale_fill_gradient(low = "white", high = "steelblue") +
  # Polyconic projection.
  coord_map("polyconic")
# Same map, with theme() used to clear the unneeded elements.
# (Stray `?` characters and the `element_blank` name split across two lines
# in the original are repaired.)
ggplot(china_data, aes(x = long, y = lat, group = group, fill = ccc)) +
  geom_polygon(colour = "grey40") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  coord_map("polyconic") +
  theme(
    panel.grid = element_blank(),
    panel.background = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    legend.position = c(0.2, 0.3)
  )
#導出文件 write.table(x,file = '',sep = '',row.names = T,col.names = T,quote = T(string是否用字符擴起來))
#seq(from,to,by,length.out = 最大長度)生成任意步長的數列
#rep(x,times) ?生成任意次數的重復向量
用'demo()'來看一些示范程序,用'help()'來閱讀在線幫助文件,或
用'help.start()'通過HTML瀏覽器來看幫助文件。
用'q()'退出R.
變量 命令 參數設置工作空間
> x <- 10;
> y <- x/7;
> y
[1] 1.428571
> options(digits=10)
> y
[1] 1.428571429
> options(digits=20)
> y
[1] 1.4285714285714286
向量 列表框
vector frame?
vector 定義: c() 限制:行列的數據要一樣 訪問:f[]
frame 定義: data.frame() 限制:列的數據要一樣 訪問:f[] ??
fix() ?可視化列表框
read.csv("first.csv")
read.table("first.txt",header = TRUE ,sep = "\t")
read.excel()
read.excel2007()
RODBC?
odbcConnectExcel()
install.packages("RODBC")
library(RODBC)
s = odbcConnectExcel("first.xls")
sqlFetch(s,"sheet1")
無法安裝rodbc?
可以試下執行:Sys.setlocale(category = "LC_ALL", locale = "us")
win+r 運行lusrmgr.msc 修改用戶名
數據的導出:
數據清洗
去重
bbc <- read.csv('1.csv',encoding = 'UTF-8')
bbc <- unique(bbc)
na.omit()
去掉空值
str_split_fixed(x,split,n)
安裝包,指定安裝包的路徑
卸載安裝包
>