diff --git a/benchmark/benchmain/main.go b/benchmark/benchmain/main.go
index 0d8a6961..0cc1f25e 100644
--- a/benchmark/benchmain/main.go
+++ b/benchmark/benchmain/main.go
@@ -22,14 +22,25 @@ Package main provides benchmark with setting flags.
 An example to run some benchmarks with profiling enabled:
 
 go run benchmark/benchmain/main.go -benchtime=10s -workloads=all \
-  -compression=on -maxConcurrentCalls=1 -traceMode=false \
-  -reqSizeBytes=1,1048576 -respSizeBytes=1,1048576 \
-  -latency=0s -kbps=0 -mtu=0 \
-  -cpuProfile=cpuProf -memProfile=memProf -memProfileRate=10000
+  -compression=on -maxConcurrentCalls=1 -trace=off \
+  -reqSizeBytes=1,1048576 -respSizeBytes=1,1048576 -networkMode=Local \
+  -cpuProfile=cpuProf -memProfile=memProf -memProfileRate=10000 -resultFile=result
+
+As a suggestion, when working on a feature branch, run this benchmark on the base revision
+first and save the result with "-resultFile=basePerf". Later, at any point during or after
+your changes, run it again and compare the new result against that baseline.
+
+Assume two result files named "basePerf" and "curPerf" have been created by passing
+-resultFile=basePerf and -resultFile=curPerf respectively.
+	To format curPerf, run:
+		go run benchmark/benchresult/main.go curPerf
+	To see how the performance has changed relative to the base result, run:
+		go run benchmark/benchresult/main.go basePerf curPerf
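+
+For example, a full comparison workflow (the flag values below are only illustrative; use
+whatever matches the benchmark of interest) looks like:
+
+	go run benchmark/benchmain/main.go -benchtime=10s -workloads=unary -resultFile=basePerf   (on the base revision)
+	go run benchmark/benchmain/main.go -benchtime=10s -workloads=unary -resultFile=curPerf    (on the revision under test)
+	go run benchmark/benchresult/main.go basePerf curPerf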
 */
 package main
 
 import (
+	"encoding/gob"
 	"errors"
 	"flag"
 	"fmt"
@@ -58,12 +69,13 @@ import (
 )
 
 const (
-	compressionOn   = "on"
-	compressionOff  = "off"
-	compressionBoth = "both"
+	modeOn   = "on"
+	modeOff  = "off"
+	modeBoth = "both"
 )
 
-var allCompressionModes = []string{compressionOn, compressionOff, compressionBoth}
+var allCompressionModes = []string{modeOn, modeOff, modeBoth}
+var allTraceModes = []string{modeOn, modeOff, modeBoth}
 
 const (
 	workloadsUnary     = "unary"
@@ -83,26 +95,34 @@ var (
 	maxConcurrentCalls     = []int{1, 8, 64, 512}
 	reqSizeBytes           = []int{1, 1024, 1024 * 1024}
 	respSizeBytes          = []int{1, 1024, 1024 * 1024}
-	enableTrace            = []bool{false}
+	enableTrace            []bool
 	benchtime              time.Duration
 	memProfile, cpuProfile string
 	memProfileRate         int
 	enableCompressor       []bool
+	networkMode            string
+	benchmarkResultFile    string
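+	// networks maps the -networkMode flag values to predefined latency.Network profiles.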
+	networks               = map[string]latency.Network{
+		"Local":    latency.Local,
+		"LAN":      latency.LAN,
+		"WAN":      latency.WAN,
+		"Longhaul": latency.Longhaul,
+	}
 )
 
-func unaryBenchmark(startTimer func(), stopTimer func(int32), benchFeatures bm.Features, benchtime time.Duration, s *stats.Stats) {
+func unaryBenchmark(startTimer func(), stopTimer func(int32), benchFeatures stats.Features, benchtime time.Duration, s *stats.Stats) {
 	caller, close := makeFuncUnary(benchFeatures)
 	defer close()
 	runBenchmark(caller, startTimer, stopTimer, benchFeatures, benchtime, s)
 }
 
-func streamBenchmark(startTimer func(), stopTimer func(int32), benchFeatures bm.Features, benchtime time.Duration, s *stats.Stats) {
+func streamBenchmark(startTimer func(), stopTimer func(int32), benchFeatures stats.Features, benchtime time.Duration, s *stats.Stats) {
 	caller, close := makeFuncStream(benchFeatures)
 	defer close()
 	runBenchmark(caller, startTimer, stopTimer, benchFeatures, benchtime, s)
 }
 
-func makeFuncUnary(benchFeatures bm.Features) (func(int), func()) {
+func makeFuncUnary(benchFeatures stats.Features) (func(int), func()) {
 	nw := &latency.Network{Kbps: benchFeatures.Kbps, Latency: benchFeatures.Latency, MTU: benchFeatures.Mtu}
 	opts := []grpc.DialOption{}
 	sopts := []grpc.ServerOption{}
@@ -133,8 +153,7 @@ func makeFuncUnary(benchFeatures bm.Features) (func(int), func()) {
 		}
 }
 
-func makeFuncStream(benchFeatures bm.Features) (func(int), func()) {
-	fmt.Println(benchFeatures)
+func makeFuncStream(benchFeatures stats.Features) (func(int), func()) {
 	nw := &latency.Network{Kbps: benchFeatures.Kbps, Latency: benchFeatures.Latency, MTU: benchFeatures.Mtu}
 	opts := []grpc.DialOption{}
 	sopts := []grpc.ServerOption{}
@@ -185,7 +204,7 @@ func streamCaller(stream testpb.BenchmarkService_StreamingCallClient, reqSize, r
 	}
 }
 
-func runBenchmark(caller func(int), startTimer func(), stopTimer func(int32), benchFeatures bm.Features, benchtime time.Duration, s *stats.Stats) {
+func runBenchmark(caller func(int), startTimer func(), stopTimer func(int32), benchFeatures stats.Features, benchtime time.Duration, s *stats.Stats) {
 	// Warm up connection.
 	for i := 0; i < 10; i++ {
 		caller(0)
@@ -224,14 +243,14 @@ func runBenchmark(caller func(int), startTimer func(), stopTimer func(int32), be
 // Initiate main function to get settings of features.
 func init() {
 	var (
-		workloads, compressorMode, readLatency    string
-		readKbps, readMtu, readMaxConcurrentCalls intSliceType
-		readReqSizeBytes, readRespSizeBytes       intSliceType
-		traceMode                                 bool
+		workloads, traceMode, compressorMode, readLatency string
+		readKbps, readMtu, readMaxConcurrentCalls         intSliceType
+		readReqSizeBytes, readRespSizeBytes               intSliceType
 	)
 	flag.StringVar(&workloads, "workloads", workloadsAll,
 		fmt.Sprintf("Workloads to execute - One of: %v", strings.Join(allWorkloads, ", ")))
-	flag.BoolVar(&traceMode, "traceMode", false, "Enable gRPC tracing")
+	flag.StringVar(&traceMode, "trace", modeOff,
+		fmt.Sprintf("Trace mode - One of: %v", strings.Join(allTraceModes, ", ")))
 	flag.StringVar(&readLatency, "latency", "", "Simulated one-way network latency - may be a comma-separated list")
 	flag.DurationVar(&benchtime, "benchtime", time.Second, "Configures the amount of time to run each benchmark")
 	flag.Var(&readKbps, "kbps", "Simulated network throughput (in kbps) - may be a comma-separated list")
@@ -239,11 +258,15 @@ func init() {
 	flag.Var(&readMaxConcurrentCalls, "maxConcurrentCalls", "Number of concurrent RPCs during benchmarks")
 	flag.Var(&readReqSizeBytes, "reqSizeBytes", "Request size in bytes - may be a comma-separated list")
 	flag.Var(&readRespSizeBytes, "respSizeBytes", "Response size in bytes - may be a comma-separated list")
-	flag.StringVar(&memProfile, "memProfile", "", "Enables memory profiling output to the filename provided")
-	flag.IntVar(&memProfileRate, "memProfileRate", 0, "Configures the memory profiling rate")
+	flag.StringVar(&memProfile, "memProfile", "", "Enables memory profiling output to the filename provided.")
+	flag.IntVar(&memProfileRate, "memProfileRate", 512*1024, "Configures the memory profiling rate. \n"+
+		"memProfile must also be set for this to take effect. To include every allocated block in the profile, "+
+		"set memProfileRate to 1; to turn off memory profiling entirely, set it to 0. Defaults to 512 * 1024.")
 	flag.StringVar(&cpuProfile, "cpuProfile", "", "Enables CPU profiling output to the filename provided")
-	flag.StringVar(&compressorMode, "compression", compressionOff,
+	flag.StringVar(&compressorMode, "compression", modeOff,
 		fmt.Sprintf("Compression mode - One of: %v", strings.Join(allCompressionModes, ", ")))
+	flag.StringVar(&benchmarkResultFile, "resultFile", "", "Save the benchmark result into a binary file, readable by benchmark/benchresult")
+	flag.StringVar(&networkMode, "networkMode", "", "Network mode - One of: Local, LAN, WAN, Longhaul; overrides latency, kbps and mtu")
 	flag.Parse()
 	if flag.NArg() != 0 {
 		log.Fatal("Error: unparsed arguments: ", flag.Args())
@@ -262,20 +285,8 @@ func init() {
 		log.Fatalf("Unknown workloads setting: %v (want one of: %v)",
 			workloads, strings.Join(allWorkloads, ", "))
 	}
-	switch compressorMode {
-	case compressionOn:
-		enableCompressor = []bool{true}
-	case compressionOff:
-		enableCompressor = []bool{false}
-	case compressionBoth:
-		enableCompressor = []bool{false, true}
-	default:
-		log.Fatalf("Unknown compression mode setting: %v (want one of: %v)",
-			compressorMode, strings.Join(allCompressionModes, ", "))
-	}
-	if traceMode {
-		enableTrace = []bool{true}
-	}
+	enableCompressor = setMode(compressorMode)
+	enableTrace = setMode(traceMode)
 	// Time input formats as (time + unit).
 	readTimeFromInput(&ltc, readLatency)
 	readIntFromIntSlice(&kbps, readKbps)
@@ -283,6 +294,27 @@ func init() {
 	readIntFromIntSlice(&maxConcurrentCalls, readMaxConcurrentCalls)
 	readIntFromIntSlice(&reqSizeBytes, readReqSizeBytes)
 	readIntFromIntSlice(&respSizeBytes, readRespSizeBytes)
+	// Rewrite latency, kbps and mtu if network mode is set.
+	if network, ok := networks[networkMode]; ok {
+		ltc = []time.Duration{network.Latency}
+		kbps = []int{network.Kbps}
+		mtu = []int{network.MTU}
+	}
+}
+
+func setMode(name string) []bool {
+	switch name {
+	case modeOn:
+		return []bool{true}
+	case modeOff:
+		return []bool{false}
+	case modeBoth:
+		return []bool{false, true}
+	default:
+		log.Fatalf("Unknown mode setting: %v (want one of: %v)",
+			name, strings.Join(allCompressionModes, ", "))
+		return []bool{}
+	}
 }
 
 type intSliceType []int
@@ -334,6 +366,7 @@ func main() {
 		len(maxConcurrentCalls), len(reqSizeBytes), len(respSizeBytes), len(enableCompressor)}
 	initalPos := make([]int, len(featuresPos))
 	s := stats.NewStats(10)
+	s.SortLatency()
 	var memStats runtime.MemStats
 	var results testing.BenchmarkResult
 	var startAllocs, startBytes uint64
@@ -350,14 +383,19 @@ func main() {
 		results = testing.BenchmarkResult{N: int(count), T: time.Now().Sub(startTime),
 			Bytes: 0, MemAllocs: memStats.Mallocs - startAllocs, MemBytes: memStats.TotalAlloc - startBytes}
 	}
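+	// Features that take only a single value are shared by every benchmark and reported once.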
+	sharedPos := make([]bool, len(featuresPos))
+	for i := 0; i < len(featuresPos); i++ {
+		if featuresNum[i] <= 1 {
+			sharedPos[i] = true
+		}
+	}
+
 	// Run benchmarks
+	resultSlice := []stats.BenchResults{}
 	for !reflect.DeepEqual(featuresPos, initalPos) || start {
 		start = false
-		tracing := "Trace"
-		if !enableTrace[featuresPos[0]] {
-			tracing = "noTrace"
-		}
-		benchFeature := bm.Features{
+		benchFeature := stats.Features{
+			NetworkMode:        networkMode,
 			EnableTrace:        enableTrace[featuresPos[0]],
 			Latency:            ltc[featuresPos[1]],
 			Kbps:               kbps[featuresPos[2]],
@@ -370,28 +408,30 @@ func main() {
 
 		grpc.EnableTracing = enableTrace[featuresPos[0]]
 		if runMode[0] {
-			fmt.Printf("Unary-%s-%s:\n", tracing, benchFeature.String())
 			unaryBenchmark(startTimer, stopTimer, benchFeature, benchtime, s)
-			fmt.Println(results.String(), results.MemString())
+			s.SetBenchmarkResult("Unary", benchFeature, results.N,
+				results.AllocedBytesPerOp(), results.AllocsPerOp(), sharedPos)
+			fmt.Println(s.BenchString())
 			fmt.Println(s.String())
+			resultSlice = append(resultSlice, s.GetBenchmarkResults())
 			s.Clear()
 		}
-
 		if runMode[1] {
-			fmt.Printf("Stream-%s-%s\n", tracing, benchFeature.String())
 			streamBenchmark(startTimer, stopTimer, benchFeature, benchtime, s)
-			fmt.Println(results.String(), results.MemString())
+			s.SetBenchmarkResult("Stream", benchFeature, results.N,
+				results.AllocedBytesPerOp(), results.AllocsPerOp(), sharedPos)
+			fmt.Println(s.BenchString())
 			fmt.Println(s.String())
+			resultSlice = append(resultSlice, s.GetBenchmarkResults())
 			s.Clear()
 		}
 		bm.AddOne(featuresPos, featuresNum)
 	}
-	after()
-
+	after(resultSlice)
 }
 
 func before() {
-	if memProfileRate > 0 {
+	if memProfile != "" {
 		runtime.MemProfileRate = memProfileRate
 	}
 	if cpuProfile != "" {
@@ -408,7 +448,7 @@ func before() {
 	}
 }
 
-func after() {
+func after(data []stats.BenchResults) {
 	if cpuProfile != "" {
 		pprof.StopCPUProfile() // flushes profile to disk
 	}
@@ -420,11 +460,20 @@ func after() {
 		}
 		runtime.GC() // materialize all statistics
 		if err = pprof.WriteHeapProfile(f); err != nil {
-			fmt.Fprintf(os.Stderr, "testing: can't write %s: %s\n", memProfile, err)
+			fmt.Fprintf(os.Stderr, "testing: can't write heap profile %s: %s\n", memProfile, err)
 			os.Exit(2)
 		}
 		f.Close()
 	}
+	if benchmarkResultFile != "" {
+		f, err := os.Create(benchmarkResultFile)
+		if err != nil {
+			log.Fatalf("testing: can't write benchmark result %s: %s\n", benchmarkResultFile, err)
+		}
+		dataEncoder := gob.NewEncoder(f)
+		if err = dataEncoder.Encode(data); err != nil {
+			log.Fatalf("testing: can't encode benchmark result %s: %s\n", benchmarkResultFile, err)
+		}
+		f.Close()
+	}
 }
 
 // nopCompressor is a compressor that just copies data.
diff --git a/benchmark/benchmark.go b/benchmark/benchmark.go
index d4a4d766..7c8578e8 100644
--- a/benchmark/benchmark.go
+++ b/benchmark/benchmark.go
@@ -39,24 +39,6 @@ import (
 	"google.golang.org/grpc/grpclog"
 )
 
-// Features contains most fields for a benchmark
-type Features struct {
-	EnableTrace        bool
-	Latency            time.Duration
-	Kbps               int
-	Mtu                int
-	MaxConcurrentCalls int
-	ReqSizeBytes       int
-	RespSizeBytes      int
-	EnableCompressor   bool
-}
-
-func (f Features) String() string {
-	return fmt.Sprintf("latency_%s-kbps_%#v-MTU_%#v-maxConcurrentCalls_"+
-		"%#v-reqSize_%#vB-respSize_%#vB-Compressor_%t",
-		f.Latency.String(), f.Kbps, f.Mtu, f.MaxConcurrentCalls, f.ReqSizeBytes, f.RespSizeBytes, f.EnableCompressor)
-}
-
 // AddOne add 1 to the features slice
 func AddOne(features []int, featuresMaxPosition []int) {
 	for i := len(features) - 1; i >= 0; i-- {
@@ -261,7 +243,7 @@ func NewClientConn(addr string, opts ...grpc.DialOption) *grpc.ClientConn {
 	return conn
 }
 
-func runUnary(b *testing.B, benchFeatures Features) {
+func runUnary(b *testing.B, benchFeatures stats.Features) {
 	s := stats.AddStats(b, 38)
 	nw := &latency.Network{Kbps: benchFeatures.Kbps, Latency: benchFeatures.Latency, MTU: benchFeatures.Mtu}
 	target, stopper := StartServer(ServerInfo{Addr: "localhost:0", Type: "protobuf", Network: nw}, grpc.MaxConcurrentStreams(uint32(benchFeatures.MaxConcurrentCalls+1)))
@@ -309,7 +291,7 @@ func runUnary(b *testing.B, benchFeatures Features) {
 	conn.Close()
 }
 
-func runStream(b *testing.B, benchFeatures Features) {
+func runStream(b *testing.B, benchFeatures stats.Features) {
 	s := stats.AddStats(b, 38)
 	nw := &latency.Network{Kbps: benchFeatures.Kbps, Latency: benchFeatures.Latency, MTU: benchFeatures.Mtu}
 	target, stopper := StartServer(ServerInfo{Addr: "localhost:0", Type: "protobuf", Network: nw}, grpc.MaxConcurrentStreams(uint32(benchFeatures.MaxConcurrentCalls+1)))
diff --git a/benchmark/benchmark16_test.go b/benchmark/benchmark16_test.go
index 4882ae18..fc33e80d 100644
--- a/benchmark/benchmark16_test.go
+++ b/benchmark/benchmark16_test.go
@@ -30,80 +30,81 @@ import (
 
 func BenchmarkClientStreamc1(b *testing.B) {
 	grpc.EnableTracing = true
-	runStream(b, Features{true, 0, 0, 0, 1, 1, 1, false})
+	runStream(b, stats.Features{"", true, 0, 0, 0, 1, 1, 1, false})
 }
 
 func BenchmarkClientStreamc8(b *testing.B) {
 	grpc.EnableTracing = true
-	runStream(b, Features{true, 0, 0, 0, 8, 1, 1, false})
+	runStream(b, stats.Features{"", true, 0, 0, 0, 8, 1, 1, false})
 }
 
 func BenchmarkClientStreamc64(b *testing.B) {
 	grpc.EnableTracing = true
-	runStream(b, Features{true, 0, 0, 0, 64, 1, 1, false})
+	runStream(b, stats.Features{"", true, 0, 0, 0, 64, 1, 1, false})
 }
 
 func BenchmarkClientStreamc512(b *testing.B) {
 	grpc.EnableTracing = true
-	runStream(b, Features{true, 0, 0, 0, 512, 1, 1, false})
+	runStream(b, stats.Features{"", true, 0, 0, 0, 512, 1, 1, false})
 }
 func BenchmarkClientUnaryc1(b *testing.B) {
 	grpc.EnableTracing = true
-	runStream(b, Features{true, 0, 0, 0, 1, 1, 1, false})
+	runStream(b, stats.Features{"", true, 0, 0, 0, 1, 1, 1, false})
 }
 
 func BenchmarkClientUnaryc8(b *testing.B) {
 	grpc.EnableTracing = true
-	runStream(b, Features{true, 0, 0, 0, 8, 1, 1, false})
+	runStream(b, stats.Features{"", true, 0, 0, 0, 8, 1, 1, false})
 }
 
 func BenchmarkClientUnaryc64(b *testing.B) {
 	grpc.EnableTracing = true
-	runStream(b, Features{true, 0, 0, 0, 64, 1, 1, false})
+	runStream(b, stats.Features{"", true, 0, 0, 0, 64, 1, 1, false})
 }
 
 func BenchmarkClientUnaryc512(b *testing.B) {
 	grpc.EnableTracing = true
-	runStream(b, Features{true, 0, 0, 0, 512, 1, 1, false})
+	runStream(b, stats.Features{"", true, 0, 0, 0, 512, 1, 1, false})
 }
 
 func BenchmarkClientStreamNoTracec1(b *testing.B) {
 	grpc.EnableTracing = false
-	runStream(b, Features{false, 0, 0, 0, 1, 1, 1, false})
+	runStream(b, stats.Features{"", false, 0, 0, 0, 1, 1, 1, false})
 }
 
 func BenchmarkClientStreamNoTracec8(b *testing.B) {
 	grpc.EnableTracing = false
-	runStream(b, Features{false, 0, 0, 0, 8, 1, 1, false})
+	runStream(b, stats.Features{"", false, 0, 0, 0, 8, 1, 1, false})
 }
 
 func BenchmarkClientStreamNoTracec64(b *testing.B) {
 	grpc.EnableTracing = false
-	runStream(b, Features{false, 0, 0, 0, 64, 1, 1, false})
+	runStream(b, stats.Features{"", false, 0, 0, 0, 64, 1, 1, false})
 }
 
 func BenchmarkClientStreamNoTracec512(b *testing.B) {
 	grpc.EnableTracing = false
-	runStream(b, Features{false, 0, 0, 0, 512, 1, 1, false})
+	runStream(b, stats.Features{"", false, 0, 0, 0, 512, 1, 1, false})
 }
 func BenchmarkClientUnaryNoTracec1(b *testing.B) {
 	grpc.EnableTracing = false
-	runStream(b, Features{false, 0, 0, 0, 1, 1, 1, false})
+	runStream(b, stats.Features{"", false, 0, 0, 0, 1, 1, 1, false})
 }
 
 func BenchmarkClientUnaryNoTracec8(b *testing.B) {
 	grpc.EnableTracing = false
-	runStream(b, Features{false, 0, 0, 0, 8, 1, 1, false})
+	runStream(b, stats.Features{"", false, 0, 0, 0, 8, 1, 1, false})
 }
 
 func BenchmarkClientUnaryNoTracec64(b *testing.B) {
 	grpc.EnableTracing = false
-	runStream(b, Features{false, 0, 0, 0, 64, 1, 1, false})
+	runStream(b, stats.Features{"", false, 0, 0, 0, 64, 1, 1, false})
 }
 
 func BenchmarkClientUnaryNoTracec512(b *testing.B) {
 	grpc.EnableTracing = false
-	runStream(b, Features{false, 0, 0, 0, 512, 1, 1, false})
+	runStream(b, stats.Features{"", false, 0, 0, 0, 512, 1, 1, false})
 }
 
 func TestMain(m *testing.M) {
diff --git a/benchmark/benchmark17_test.go b/benchmark/benchmark17_test.go
index df1bb431..8dc7d3c5 100644
--- a/benchmark/benchmark17_test.go
+++ b/benchmark/benchmark17_test.go
@@ -56,7 +56,7 @@ func BenchmarkClient(b *testing.B) {
 			tracing = "noTrace"
 		}
 
-		benchFeature := Features{
+		benchFeature := stats.Features{
 			EnableTrace:        enableTrace[featuresCurPos[0]],
 			Latency:            latency[featuresCurPos[1]],
 			Kbps:               kbps[featuresCurPos[2]],
diff --git a/benchmark/benchresult/main.go b/benchmark/benchresult/main.go
new file mode 100644
index 00000000..40226cff
--- /dev/null
+++ b/benchmark/benchresult/main.go
@@ -0,0 +1,133 @@
+/*
+ *
+ * Copyright 2017 gRPC authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+/*
+To format the benchmark result:
+  go run benchmark/benchresult/main.go resultfile
+
+To see how the performance has changed relative to an old result:
+  go run benchmark/benchresult/main.go resultfile_old resultfile
+It prints a comparison for the benchmarks present in both files.
+
+*/
+package main
+
+import (
+	"encoding/gob"
+	"fmt"
+	"log"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"google.golang.org/grpc/benchmark/stats"
+)
+
+func createMap(fileName string, m map[string]stats.BenchResults) {
+	f, err := os.Open(fileName)
+	if err != nil {
+		log.Fatalf("Read file %s error: %s\n", fileName, err)
+	}
+	defer f.Close()
+	var data []stats.BenchResults
+	decoder := gob.NewDecoder(f)
+	if err = decoder.Decode(&data); err != nil {
+		log.Fatalf("Decode file %s error: %s\n", fileName, err)
+	}
+	for _, d := range data {
+		m[d.RunMode+"-"+d.Features.String()] = d
+	}
+}
+
+func intChange(title string, val1, val2 int64) string {
+	return fmt.Sprintf("%10s %12s %12s %8.2f%%\n", title, strconv.FormatInt(val1, 10),
+		strconv.FormatInt(val2, 10), float64(val2-val1)*100/float64(val1))
+}
+
+func timeChange(title int, val1, val2 time.Duration) string {
+	return fmt.Sprintf("%10s %12s %12s %8.2f%%\n", strconv.Itoa(title)+" latency", val1.String(),
+		val2.String(), float64(val2-val1)*100/float64(val1))
+}
+
+func compareTwoMap(m1, m2 map[string]stats.BenchResults) {
+	for k2, v2 := range m2 {
+		if v1, ok := m1[k2]; ok {
+			changes := k2 + "\n"
+			changes += fmt.Sprintf("%10s %12s %12s %8s\n", "Title", "Before", "After", "Percentage")
+			changes += intChange("Bytes/op", v1.AllocedBytesPerOp, v2.AllocedBytesPerOp)
+			changes += intChange("Allocs/op", v1.AllocsPerOp, v2.AllocsPerOp)
+			changes += timeChange(v1.Latency[1].Percent, v1.Latency[1].Value, v2.Latency[1].Value)
+			changes += timeChange(v1.Latency[2].Percent, v1.Latency[2].Value, v2.Latency[2].Value)
+			fmt.Printf("%s\n", changes)
+		}
+	}
+}
+
+func compareBenchmark(file1, file2 string) {
+	benchValueFile1 := make(map[string]stats.BenchResults)
+	benchValueFile2 := make(map[string]stats.BenchResults)
+
+	createMap(file1, benchValueFile1)
+	createMap(file2, benchValueFile2)
+
+	compareTwoMap(benchValueFile1, benchValueFile2)
+}
+
+func printline(benchName, ltc50, ltc90, allocByte, allocsOp interface{}) {
+	fmt.Printf("%-80v%12v%12v%12v%12v\n", benchName, ltc50, ltc90, allocByte, allocsOp)
+}
+
+func formatBenchmark(fileName string) {
+	f, err := os.Open(fileName)
+	if err != nil {
+		log.Fatalf("Read file %s error: %s\n", fileName, err)
+	}
+	defer f.Close()
+	var data []stats.BenchResults
+	decoder := gob.NewDecoder(f)
+	if err = decoder.Decode(&data); err != nil {
+		log.Fatalf("Decode file %s error: %s\n", fileName, err)
+	}
+	if len(data) == 0 {
+		log.Fatalf("No data in file %s\n", fileName)
+	}
+	printPos := data[0].SharedPosition
+	fmt.Println("\nShared features:\n" + strings.Repeat("-", 20))
+	fmt.Print(stats.PartialPrintString(printPos, data[0].Features, true))
+	fmt.Println(strings.Repeat("-", 35))
+	for i := 0; i < len(data[0].SharedPosition); i++ {
+		printPos[i] = !printPos[i]
+	}
+	printline("Name", "latency-50", "latency-90", "Alloc (B)", "Alloc (#)")
+	for _, d := range data {
+		name := d.RunMode + stats.PartialPrintString(printPos, d.Features, false)
+		printline(name, d.Latency[1].Value.String(), d.Latency[2].Value.String(),
+			d.AllocedBytesPerOp, d.AllocsPerOp)
+	}
+}
+
+func main() {
+	switch len(os.Args) {
+	case 2:
+		formatBenchmark(os.Args[1])
+	case 3:
+		compareBenchmark(os.Args[1], os.Args[2])
+	default:
+		log.Fatalf("Usage:\n  %s <result file>\n  %s <old result file> <new result file>\n", os.Args[0], os.Args[0])
+	}
+}
diff --git a/benchmark/latency/latency.go b/benchmark/latency/latency.go
index 66a8a0d0..5839a5c4 100644
--- a/benchmark/latency/latency.go
+++ b/benchmark/latency/latency.go
@@ -63,6 +63,17 @@ type Network struct {
 	MTU     int           // Bytes per packet; if non-positive, infinite
 }
 
+var (
+	// Local simulates a local network.
+	Local = Network{0, 0, 0}
+	// LAN simulates a local area network.
+	LAN = Network{100 * 1024, 2 * time.Millisecond, 1500}
+	// WAN simulates a wide area network.
+	WAN = Network{20 * 1024, 30 * time.Millisecond, 1500}
+	// Longhaul simulates a high-latency, long-haul network.
+	Longhaul = Network{1000 * 1024, 200 * time.Millisecond, 9000}
+)
+
 // Conn returns a net.Conn that wraps c and injects n's latency into that
 // connection.  This function also imposes latency for connection creation.
 // If n's Latency is lower than the measured latency in c, an error is
diff --git a/benchmark/stats/stats.go b/benchmark/stats/stats.go
index 03569ae2..412daead 100644
--- a/benchmark/stats/stats.go
+++ b/benchmark/stats/stats.go
@@ -23,9 +23,132 @@ import (
 	"fmt"
 	"io"
 	"math"
+	"sort"
+	"strconv"
 	"time"
 )
 
+// Features holds the configuration of a single benchmark run.
+type Features struct {
+	NetworkMode        string
+	EnableTrace        bool
+	Latency            time.Duration
+	Kbps               int
+	Mtu                int
+	MaxConcurrentCalls int
+	ReqSizeBytes       int
+	RespSizeBytes      int
+	EnableCompressor   bool
+}
+
+// String returns a string representation of Features.
+func (f Features) String() string {
+	return fmt.Sprintf("traceMode_%t-latency_%s-kbps_%#v-MTU_%#v-maxConcurrentCalls_"+
+		"%#v-reqSize_%#vB-respSize_%#vB-Compressor_%t", f.EnableTrace,
+		f.Latency.String(), f.Kbps, f.Mtu, f.MaxConcurrentCalls, f.ReqSizeBytes, f.RespSizeBytes, f.EnableCompressor)
+}
+
+// PartialPrintString prints only the selected features, either in the shared
+// (one feature per line) format or in the per-benchmark (single line) format.
+func PartialPrintString(noneEmptyPos []bool, f Features, shared bool) string {
+	s := ""
+	var (
+		prefix, suffix, linker string
+		isNetwork              bool
+	)
+	if shared {
+		suffix = "\n"
+		linker = ": "
+	} else {
+		prefix = "-"
+		linker = "_"
+	}
+	if noneEmptyPos[0] {
+		s += fmt.Sprintf("%sTrace%s%t%s", prefix, linker, f.EnableTrace, suffix)
+	}
+	if shared && f.NetworkMode != "" {
+		s += fmt.Sprintf("Network: %s \n", f.NetworkMode)
+		isNetwork = true
+	}
+	if !isNetwork {
+		if noneEmptyPos[1] {
+			s += fmt.Sprintf("%slatency%s%s%s", prefix, linker, f.Latency.String(), suffix)
+		}
+		if noneEmptyPos[2] {
+			s += fmt.Sprintf("%skbps%s%#v%s", prefix, linker, f.Kbps, suffix)
+		}
+		if noneEmptyPos[3] {
+			s += fmt.Sprintf("%sMTU%s%#v%s", prefix, linker, f.Mtu, suffix)
+		}
+	}
+	if noneEmptyPos[4] {
+		s += fmt.Sprintf("%sCallers%s%#v%s", prefix, linker, f.MaxConcurrentCalls, suffix)
+	}
+	if noneEmptyPos[5] {
+		s += fmt.Sprintf("%sreqSize%s%#vB%s", prefix, linker, f.ReqSizeBytes, suffix)
+	}
+	if noneEmptyPos[6] {
+		s += fmt.Sprintf("%srespSize%s%#vB%s", prefix, linker, f.RespSizeBytes, suffix)
+	}
+	if noneEmptyPos[7] {
+		s += fmt.Sprintf("%sCompressor%s%t%s", prefix, linker, f.EnableCompressor, suffix)
+	}
+	return s
+}
+
+type percentLatency struct {
+	Percent int
+	Value   time.Duration
+}
+
+// BenchResults records features and result of a benchmark.
+type BenchResults struct {
+	RunMode           string
+	Features          Features
+	Latency           []percentLatency
+	Operations        int
+	NsPerOp           int64
+	AllocedBytesPerOp int64
+	AllocsPerOp       int64
+	SharedPosition    []bool
+}
+
+// SetBenchmarkResult sets the features and the basic results of the benchmark.
+func (stats *Stats) SetBenchmarkResult(mode string, features Features, o int, allocdBytes, allocs int64, sharedPos []bool) {
+	stats.result.RunMode = mode
+	stats.result.Features = features
+	stats.result.Operations = o
+	stats.result.AllocedBytesPerOp = allocdBytes
+	stats.result.AllocsPerOp = allocs
+	stats.result.SharedPosition = sharedPos
+}
+
+// GetBenchmarkResults returns the recorded results of the benchmark, including its features.
+func (stats *Stats) GetBenchmarkResults() BenchResults {
+	return stats.result
+}
+
+// BenchString returns the benchmark results as a string, with each latency
+// reported as a value followed by its time unit.
+func (stats *Stats) BenchString() string {
+	stats.maybeUpdate()
+	s := stats.result
+	res := s.RunMode + "-" + s.Features.String() + ": \n"
+	if len(s.Latency) != 0 {
+		var statsUnit = s.Latency[0].Value
+		var timeUnit = fmt.Sprintf("%v", statsUnit)[1:]
+		for i := 1; i < len(s.Latency)-1; i++ {
+			res += fmt.Sprintf("%d_Latency: %s %s \t", s.Latency[i].Percent,
+				strconv.FormatFloat(float64(s.Latency[i].Value)/float64(statsUnit), 'f', 4, 64), timeUnit)
+		}
+		res += fmt.Sprintf("Avg latency: %s %s \t",
+			strconv.FormatFloat(float64(s.Latency[len(s.Latency)-1].Value)/float64(statsUnit), 'f', 4, 64), timeUnit)
+	}
+	res += fmt.Sprintf("Count: %v \t", s.Operations)
+	res += fmt.Sprintf("%v Bytes/op\t", s.AllocedBytesPerOp)
+	res += fmt.Sprintf("%v Allocs/op\t", s.AllocsPerOp)
+
+	return res
+}
+
 // Stats is a simple helper for gathering additional statistics like histogram
 // during benchmarks. This is not thread safe.
 type Stats struct {
@@ -36,6 +159,9 @@ type Stats struct {
 
 	durations durationSlice
 	dirty     bool
+
+	sortLatency bool
+	result      BenchResults
 }
 
 type durationSlice []time.Duration
@@ -64,6 +190,18 @@ func (stats *Stats) Clear() {
 	stats.durations = stats.durations[:0]
 	stats.histogram = nil
 	stats.dirty = false
+	stats.result = BenchResults{}
+}
+
+// durationSlice implements sort.Interface so that recorded latencies can be sorted.
+func (a durationSlice) Len() int           { return len(a) }
+func (a durationSlice) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
+func (a durationSlice) Less(i, j int) bool { return a[i] < a[j] }
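+
+// max returns the larger of two int64 values.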
+func max(a, b int64) int64 {
+	if a > b {
+		return a
+	}
+	return b
 }
 
 // maybeUpdate updates internal stat data if there was any newly added
@@ -73,6 +211,12 @@ func (stats *Stats) maybeUpdate() {
 		return
 	}
 
+	if stats.sortLatency {
+		sort.Sort(stats.durations)
+		stats.min = int64(stats.durations[0])
+		stats.max = int64(stats.durations[len(stats.durations)-1])
+	}
+
 	stats.min = math.MaxInt64
 	stats.max = 0
 	for _, d := range stats.durations {
@@ -109,12 +253,28 @@ func (stats *Stats) maybeUpdate() {
 	}
 
 	stats.dirty = false
+
+	if stats.durations.Len() != 0 {
+		var percentToObserve = []int{50, 90}
+		// The first entry records the time unit used for the latency values.
+		stats.result.Latency = append(stats.result.Latency, percentLatency{Percent: -1, Value: stats.unit})
+		for _, position := range percentToObserve {
+			stats.result.Latency = append(stats.result.Latency, percentLatency{Percent: position, Value: stats.durations[max(stats.histogram.Count*int64(position)/100-1, 0)]})
+		}
+		// The last entry records the average latency.
+		avg := float64(stats.histogram.Sum) / float64(stats.histogram.Count)
+		stats.result.Latency = append(stats.result.Latency, percentLatency{Percent: -1, Value: time.Duration(avg)})
+	}
+}
+
+// SortLatency instructs Stats to sort the recorded latencies before computing percentiles.
+func (stats *Stats) SortLatency() {
+	stats.sortLatency = true
 }
 
 // Print writes textual output of the Stats.
 func (stats *Stats) Print(w io.Writer) {
 	stats.maybeUpdate()
-
 	if stats.histogram == nil {
 		fmt.Fprint(w, "Histogram (empty)\n")
 	} else {