# 知识库 : 新浪微博跑男数据处理(R)
# (Knowledge base: Sina Weibo "Running Man" data processing in R)
#
# Edit Document

# ---- Load the raw Weibo export and expand it to one row per mention ----
# NOTE(review): columns are addressed by position; column 2 is used as the
# posting user's id and column 3 as a comma-separated list of mentioned ids
# -- confirm against the CSV header.
wb.data <- read.csv("d:/data/wb_big/wbnew.csv")

data <- wb.data

# Split every mention list in one vectorised pass.
# The previous row-by-row loop never closed its outer `for` (so the file did
# not parse), iterated with `1:n` (which counts over c(1, 0) when n is 0, so
# an empty mention list or an empty file crashed it), grew both result
# vectors one element at a time and printed every row index.
mention.list <- strsplit(as.character(data[, 3]), split = ",")
mention.count <- lengths(mention.list)

# Repeat each uid once per mention so both vectors stay aligned; rows whose
# mention list is empty contribute no output rows.
result.uid <- rep(data[, 2], times = mention.count)
# as.character() keeps a zero-length character vector (instead of NULL) when
# there is nothing to unlist, so the data frame below always has both columns.
result.mid <- as.character(unlist(mention.list))

result <- data.frame("uid" = result.uid, "mid" = result.mid,
                     stringsAsFactors = FALSE)

#' Count how often each (uid, mid) pair occurs and keep the frequent pairs.
#'
#' @param mention.size Minimum number of occurrences a pair needs in order to
#'   be kept.
#' @param data Data frame with one row per mention and columns `uid` and
#'   `mid`. When omitted, the global `result` table is used, which is what
#'   this function always read before `data` was honoured.
#' @return Data frame with columns `uid`, `mid` and `weight` (the occurrence
#'   count), ordered by decreasing `weight` and restricted to rows with
#'   `weight >= mention.size`.
getMenction <- function(mention.size, data) {
  # The earlier body ignored `data` and always aggregated the global
  # `result`; fall back to it only when the caller passes nothing.
  if (missing(data)) {
    data <- result
  }

  # aggregate() refuses zero-row input, so answer the empty case directly.
  if (nrow(data) == 0L) {
    return(data.frame("uid" = data$uid, "mid" = data$mid,
                      "weight" = integer(0), stringsAsFactors = FALSE))
  }

  counts <- aggregate(data$uid,
                      by = list("uid" = data$uid, "mid" = data$mid),
                      FUN = length)
  # aggregate() names the counted column "x"; expose it as "weight".
  colnames(counts)[3] <- "weight"

  counts <- counts[order(counts$weight, decreasing = TRUE), ]
  counts[counts$weight >= mention.size, ]
}

#' Export node, link and category CSV tables for the mention network.
#'
#' Writes `node.csv`, `link.csv` and `category.csv` into `out.dir`.
#'
#' @param max.size Lower bound on edge weight: despite the name, rows with
#'   `weight >= max.size` are the ones kept.
#' @param data Edge table with columns `uid`, `mid` and `weight`.
#' @param out.dir Directory the three CSV files are written to. Defaults to
#'   the location that used to be hard-coded.
#' @return `NULL`, invisibly; the function is called for its file output.
wbStat <- function(max.size, data, out.dir = "d:/wb") {
  edges <- data[data$weight >= max.size, ]

  # Every id that appears on either end of a kept edge becomes a node.
  node.uid <- unique(c(edges$mid, edges$uid))
  n.node <- length(node.uid)
  # seq_len()/rep() keep all columns the same length when no edge survives
  # the filter (1:length(x) would yield c(1, 0) there).
  node <- data.frame(
    "ID" = seq_len(n.node),
    "UID" = node.uid,
    "NAME" = rep("", n.node),
    "VALUE" = rep(0, n.node),
    "CATEGORY" = rep(3, n.node),
    stringsAsFactors = FALSE
  )

  # Accounts that get a display name and a non-default category
  # (1 for the first entry, 2 for the rest; everything else stays 3).
  known <- data.frame(
    uid = c("5242381821", "5187664653", "1259193624", "1642351362",
            "1574684061", "1275280670", "1730330447", "1426725707",
            "1254123322"),
    name = c("奔跑吧兄弟官微", "邓超", "李晨", "Baby", "陈赫", "郑凯",
             "王祖蓝", "包贝尔", "王宝强"),
    category = c(1, rep(2, 8)),
    stringsAsFactors = FALSE
  )
  hit <- match(as.character(node$UID), known$uid)
  found <- !is.na(hit)
  node$NAME[found] <- known$name[hit[found]]
  node$CATEGORY[found] <- known$category[hit[found]]

  write.csv(node, file.path(out.dir, "node.csv"), fileEncoding = "UTF-8")

  # Resolve both edge ends to node IDs: first the mentioned id ("target"),
  # then the mentioning id ("source"). Columns are picked by name so extra
  # columns in `data` cannot shift the selection.
  link <- merge(edges, node, by.x = "mid", by.y = "UID", incomparables = NA)
  link <- link[, c("mid", "uid", "weight", "ID")]
  colnames(link)[4] <- "target"

  link <- merge(link, node, by.x = "uid", by.y = "UID", incomparables = NA)
  link <- link[, c("uid", "mid", "weight", "target", "ID")]
  colnames(link)[5] <- "source"

  write.csv(link, file.path(out.dir, "link.csv"), fileEncoding = "UTF-8")

  category.name <- c("跑男官微", "跑男成员", "其他人")
  category <- data.frame(
    "INDEX" = c(1, 2, 3),
    "NAME" = category.name,
    "KEYWORD" = category.name,
    "BASE" = c("", "", ""),
    stringsAsFactors = FALSE
  )

  write.csv(category, file.path(out.dir, "category.csv"),
            fileEncoding = "UTF-8")

  invisible(NULL)
}

# ---- Weibo tree-structure processing ----
# NOTE(review): `temp` is not defined with this shape anywhere in the visible
# file. The code below needs a mention-edge data frame with columns `uid`,
# `mid` and `weight` (the shape getMenction() returns) -- confirm where it is
# meant to come from.
stopifnot(is.data.frame(temp),
          all(c("uid", "mid", "weight") %in% names(temp)))

# First entry is the root account of the tree; the remaining eight are its
# direct children.
man <- c("5242381821", "5187664653", "1259193624", "1642351362",
         "1574684061", "1275280670", "1730330447", "1426725707",
         "1254123322")

tree.data <- data.frame(
  "parent.uid" = c("", rep("5242381821", 8)),
  "uid" = man,
  "parent.id" = "",
  "id" = "",
  "weight" = 0,
  stringsAsFactors = FALSE
)

# Keep only edges whose two ends are both in `man`.
t <- temp[(temp$uid %in% man), ]
t <- t[(t$mid %in% man), ]

# Weight of each child = mentions between it and the root, both directions.
# sum() over the matching rows treats a missing direction as 0; adding two
# separately selected values failed with a zero-length replacement whenever
# one direction had no row.
for (i in seq_along(man)[-1]) {
  from.root <- t$uid == "5242381821" & t$mid == man[i]
  to.root <- t$mid == "5242381821" & t$uid == man[i]
  tree.data$weight[tree.data$uid == man[i]] <-
    sum(t$weight[from.root | to.root])
}

man.2 <- man[-1]

# Edges touching any of the eight children on either end ...
t.2 <- temp[(temp$uid %in% man.2), ]
t.2.mid <- temp[(temp$mid %in% man.2), ]
t.2 <- unique(rbind(t.2, t.2.mid))

# ... excluding edges that involve the root account and self-mentions.
t.2 <- t.2[t.2$uid != "5242381821", ]
t.2 <- t.2[t.2$mid != "5242381821", ]
t.2 <- t.2[t.2$mid != t.2$uid, ]

# Ids on those edges that are not one of the eight children themselves.
data3 <- unique(c(t.2$mid, t.2$uid))
# `-(x %in% y)` turned the logical mask into indices 0 / -1, which dropped
# only the first element (or returned nothing when no id matched); logical
# negation is what actually removes the `man.2` ids.
data3 <- data3[!(data3 %in% man.2)]

# 知识库 : 新浪微博跑男数据处理(R)
#
# Attachments: