blob: b8e80b9b2280516fb8b5073ba49616967275cf77 [file] [log] [blame]
library(stringr)
library(xtable)
library(plyr)
library(ggplot2)
stringToDF <- function(x, ncol){
m <- matrix(unlist(str_split(x, "\t|\n")), ncol = ncol, byrow = TRUE)
df <- data.frame(m[-1, ], stringsAsFactors = FALSE)
names(df) <- m[1, ]
df
}
##print(xtable(stringToDF(x, 3)), include.rownames = FALSE)
stringToDF2 <- function(x, query){
m <- matrix(unlist(str_split(x, "\t|\n")), ncol = 3, byrow = TRUE)
df <- data.frame(m[-1, ], stringsAsFactors = FALSE)
names(df) <- m[1, ]
df[, 2] <- as.numeric(str_replace_all(df[, 2], ",", ""))
df[, 3] <- as.numeric(str_replace_all(df[, 3], ",", ""))
df2 <- ldply(1:nrow(df), function(i) gsub("^\\s+|\\s+$", "", str_split(df[i, 1], ",")[[1]]))
names(df2) <- c("cores", "nodes", "configuration")
df2$query <- query
cbind(df, df2)
}
x2 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
26,610,386,635
17,740,258
15-core, 75 nodes, mmap
25,224,873,928
22,422,110
15-core, 50 nodes, mmap
20,387,152,160
27,182,870
15-core, 25 nodes, mmap
11,910,388,894
31,761,037
4-core, 131 nodes, in-memory
10,008,730,163
19,100,630
4-core, 131 nodes, mmap
10,129,695,120
19,331,479
4-core, 50 nodes, mmap
6,626,570,688
33,132,853"
x3 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
16,223,081,703
10,815,388
15-core, 75 nodes, mmap
9,860,968,285
8,765,305
15-core, 50 nodes, mmap
8,093,611,909
10,791,483
15-core, 25 nodes, mmap
4,126,502,352
11,004,006
4-core, 131 nodes, in-memory
5,755,274,389
10,983,348
4-core, 131 nodes, mmap
5,032,185,657
9,603,408
4-core, 50 nodes, mmap
1,720,238,609
8,601,193"
x4 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
7,591,604,822
5,061,070
15-core, 75 nodes, mmap
4,319,179,995
3,839,271
15-core, 50 nodes, mmap
3,406,554,102
4,542,072
15-core, 25 nodes, mmap
1,826,451,888
4,870,538
4-core, 131 nodes, in-memory
1,936,648,601
3,695,894
4-core, 131 nodes, mmap
2,210,367,152
4,218,258
4-core, 50 nodes, mmap
1,002,291,562
5,011,458"
x5 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
10,241,183,745
6,827,456
15-core, 75 nodes, mmap
4,891,097,559
4,347,642
15-core, 50 nodes, mmap
3,616,707,511
4,822,277
15-core, 25 nodes, mmap
1,665,053,263
4,440,142
4-core, 131 nodes, in-memory
4,388,159,569
8,374,350
4-core, 131 nodes, mmap
2,444,344,232
4,664,779
4-core, 50 nodes, mmap
1,215,737,558
6,078,688"
x6 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
7,309,984,688
4,873,323
15-core, 75 nodes, mmap
3,333,628,777
2,963,226
15-core, 50 nodes, mmap
2,555,300,237
3,407,067
15-core, 25 nodes, mmap
1,384,674,717
3,692,466
4-core, 131 nodes, in-memory
3,237,907,984
6,179,214
4-core, 131 nodes, mmap
1,740,481,380
3,321,529
4-core, 50 nodes, mmap
863,170,420
4,315,852"
x7 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
4,064,424,274
2,709,616
15-core, 75 nodes, mmap
2,014,067,386
1,790,282
15-core, 50 nodes, mmap
1,499,452,617
1,999,270
15-core, 25 nodes, mmap
810,143,518
2,160,383
4-core, 131 nodes, in-memory
1,670,214,695
3,187,433
4-core, 131 nodes, mmap
1,116,635,690
2,130,984
4-core, 50 nodes, mmap
531,389,163
2,656,946"
dat <- ldply(2:7, function(i){
df <- eval(parse(text = paste("x", i, sep = "")))
stringToDF2(df, paste("Query", i - 1, sep = " "))
})
ggplot(data = dat, aes(
x = as.numeric(str_extract(nodes, "[0-9]+")),
y = `Cluster scan rate (rows/sec)` / 1e9,
shape = configuration,
color = query,
size = factor(cores, levels = c("4-core", "15-core"))
)) +
scale_size_manual(values = c(3, 6), guide = guide_legend(title = "Cores Per Node")) +
scale_color_discrete(guide = guide_legend(title = "Query")) +
scale_shape_discrete(guide = guide_legend(title = "Configuration")) +
geom_point() +
scale_y_log10(breaks = 2^(-1:5)) +
facet_grid(configuration~.) +
xlab("Number of Nodes") +
ylab("Cluster Scan Rate (billion rows/sec.)")
dat2 <- subset(dat, cores == "15-core" & configuration == "mmap")
dat2$x <- as.numeric(str_extract(dat2$nodes, "[0-9]+"))
baseRate <- list()
d_ply(subset(dat2, x == 25), .(query), function(df) baseRate[df$query] <<- df$`Cluster scan rate (rows/sec)`)
dat2 <- ddply(dat2, .(query, x), function(df){
df$projected <- df$x / 25 * unlist(baseRate[df$query])
df
})
ggplot(data = dat2, aes(
x = as.numeric(str_extract(nodes, "[0-9]+")),
y = `Cluster scan rate (rows/sec)` / 1e9,
color = query,
shape = query
)) +
# scale_y_log10(breaks = 2^(-1:5)) +
# scale_color_discrete(guide = guide_legend(title = "Query")) +
geom_point(size = 3) +
# geom_path(aes(y = projected)) +
geom_line(aes(y = projected / 1e9)) +
# scale_y_log10() +
xlab("Number of Nodes") +
ylab("Cluster Scan Rate (billion rows/sec.)")
ggsave("../figures/cluster_scan_rate.pdf", width = 4, height = 3)
## ggplot(data = dat, aes(
## x = as.numeric(str_extract(nodes, "[0-9]+")),
## y = `Core scan rate` / 1e6,
## shape = configuration,
## color = query,
## size = factor(cores, levels = c("4-core", "15-core"))
## )) +
## scale_size_manual(values = c(3, 6), guide = guide_legend(title = "Cores Per Node")) +
## scale_color_discrete(guide = guide_legend(title = "Query")) +
## scale_shape_discrete(guide = guide_legend(title = "Configuration")) +
## geom_point() +
## scale_y_log10(breaks = 2^(-1:5)) +
## xlab("Number of Nodes") +
## ylab("Core Scan Rate (million rows/sec.)")
ggplot(data = dat2, aes(
x = as.numeric(str_extract(nodes, "[0-9]+")),
y = `Core scan rate` / 1e6,
color = query,
shape = query
)) +
geom_point(size = 3) +
xlab("Number of Nodes") +
ylab("Core Scan Rate (million rows/sec.)")
ggsave("../figures/core_scan_rate.pdf", width = 4, height = 3)