From b405dafb89ec4560bacbcb64e5d4923db3a6b7ac Mon Sep 17 00:00:00 2001 From: Vikram Rangnekar Date: Wed, 8 May 2019 19:03:18 -0400 Subject: [PATCH] New low allocation fast json parsing and editing library --- json/filter.go | 163 ++++++++++++++++++++++++ json/get.go | 139 ++++++++++++++++++++ json/json_test.go | 317 ++++++++++++++++++++++++++++++++++++++++++++++ json/replace.go | 159 +++++++++++++++++++++++ json/strip.go | 118 +++++++++++++++++ 5 files changed, 896 insertions(+) create mode 100644 json/filter.go create mode 100644 json/get.go create mode 100644 json/json_test.go create mode 100644 json/replace.go create mode 100644 json/strip.go diff --git a/json/filter.go b/json/filter.go new file mode 100644 index 0000000..d8d01c1 --- /dev/null +++ b/json/filter.go @@ -0,0 +1,163 @@ +package json + +import ( + "bytes" + "crypto/sha1" +) + +func Filter(w *bytes.Buffer, b []byte, keys []string) error { + s := 0 + state := expectKey + var err error + + kmap := make(map[[20]byte]struct{}, len(keys)) + + for _, k := range keys { + h := sha1.Sum([]byte(k)) + if _, ok := kmap[h]; !ok { + kmap[h] = struct{}{} + } + } + + isList := false + item := 0 + field := 0 + for i := 0; i < len(b); i++ { + switch { + case state == expectKey: + switch b[i] { + case '[': + if !isList { + err = w.WriteByte('[') + } + isList = true + case '{': + if item == 0 { + err = w.WriteByte('{') + } else { + _, err = w.WriteString("},{") + } + item++ + field = 0 + case '"': + state = expectKeyClose + s = i + i++ + } + case state == expectKeyClose && b[i] == '"': + state = expectColon + i++ + } + + if err != nil { + return nil + } + + if state != expectColon { + continue + } + + k := b[(s + 1):(i - 1)] + h := sha1.Sum(k) + _, kf := kmap[h] + + e := 0 + d := 0 + for ; i < len(b); i++ { + if state == expectObjClose || state == expectListClose { + switch b[i] { + case '{', '[': + d++ + case '}', ']': + d-- + } + } + + switch { + case state == expectColon && b[i] == ':': + state = expectValue + + case 
state == expectValue && b[i] == '"': + state = expectString + + case state == expectString && b[i] == '"': + e = i + + case state == expectValue && b[i] == '[': + state = expectListClose + d++ + + case state == expectListClose && d == 0 && b[i] == ']': + e = i + + case state == expectValue && b[i] == '{': + state = expectObjClose + d++ + + case state == expectObjClose && d == 0 && b[i] == '}': + e = i + + case state == expectValue && (b[i] >= '0' && b[i] <= '9'): + state = expectNumClose + + case state == expectNumClose && + ((b[i] < '0' || b[i] > '9') && + (b[i] != '.' && b[i] != 'e' && b[i] != 'E' && b[i] != '+' && b[i] != '-')): + i-- + e = i + + case state == expectValue && + (b[i] == 'f' || b[i] == 'F' || b[i] == 't' || b[i] == 'T'): + state = expectBoolClose + + case state == expectBoolClose && (b[i] == 'e' || b[i] == 'E'): + e = i + } + + if e != 0 { + if kf { + if field != 0 { + if err := w.WriteByte(','); err != nil { + return err + } + } + + cb := b[s:(e + 1)] + sk := 0 + for i := 0; i < len(cb); i++ { + if cb[i] == '\n' || cb[i] == '\t' { + if _, err := w.Write(cb[sk:i]); err != nil { + return err + } + sk = i + 1 + } + } + + if sk > 0 && sk < len(cb) { + _, err = w.Write(cb[sk:len(cb)]) + } else { + _, err = w.Write(cb) + } + if err != nil { + return err + } + field++ + } + state = expectKey + break + } + } + } + if item != 0 { + if err := w.WriteByte('}'); err != nil { + return err + } + } + if isList { + if err := w.WriteByte(']'); err != nil { + return err + } + } + + return nil +} diff --git a/json/get.go b/json/get.go new file mode 100644 index 0000000..a9e5636 --- /dev/null +++ b/json/get.go @@ -0,0 +1,139 @@ +package json + +import ( + "crypto/sha1" +) + +const ( + expectKey int = iota + expectKeyClose + expectColon + expectValue + expectString + expectListClose + expectObjClose + expectBoolClose + expectNumClose +) + +type Field struct { + Key []byte + Value []byte +} + +func Get(b []byte, keys [][]byte) []Field { + s := 0 + state := expectKey + 
+ kmap := make(map[[20]byte]struct{}, len(keys)) + + for _, k := range keys { + h := sha1.Sum(k) + if _, ok := kmap[h]; !ok { + kmap[h] = struct{}{} + } + } + + l := 10 + res := make([]Field, l) + + for i := 0; i < len(b); i++ { + switch { + case state == expectKey && b[i] == '"': + state = expectKeyClose + s = i + 1 + continue + + case state == expectKeyClose && b[i] == '"': + state = expectColon + } + + if state != expectColon { + continue + } + + k := b[s:i] + h := sha1.Sum(k) + _, kf := kmap[h] + + e := 0 + d := 0 + for ; i < len(b); i++ { + if state == expectObjClose || state == expectListClose { + switch b[i] { + case '{', '[': + d++ + case '}', ']': + d-- + } + } + + switch { + case state == expectColon && b[i] == ':': + state = expectValue + + case state == expectValue && b[i] == '"': + state = expectString + s = i + + case state == expectString && b[i] == '"': + e = i + + case state == expectValue && b[i] == '[': + state = expectListClose + s = i + d++ + + case state == expectListClose && d == 0 && b[i] == ']': + e = i + i = s + + case state == expectValue && b[i] == '{': + state = expectObjClose + s = i + d++ + + case state == expectObjClose && d == 0 && b[i] == '}': + e = i + i = s + + case state == expectValue && (b[i] >= '0' && b[i] <= '9'): + state = expectNumClose + s = i + + case state == expectNumClose && + ((b[i] < '0' || b[i] > '9') && + (b[i] != '.' 
&& b[i] != 'e' && b[i] != 'E' && b[i] != '+' && b[i] != '-')): + i-- + e = i + + case state == expectValue && + (b[i] == 'f' || b[i] == 'F' || b[i] == 't' || b[i] == 'T'): + state = expectBoolClose + s = i + + case state == expectBoolClose && (b[i] == 'e' || b[i] == 'E'): + e = i + } + + if e != 0 { + e++ + + if kf { + if len(res) == cap(res) { + r := make([]Field, 0, (len(res) + l)) + copy(r, res) + res = r + } + + res = append(res, Field{k, b[s:e]}) + } + + state = expectKey + break + } + } + } + + return res +} diff --git a/json/json_test.go b/json/json_test.go new file mode 100644 index 0000000..d6dd467 --- /dev/null +++ b/json/json_test.go @@ -0,0 +1,317 @@ +package json + +import ( + "bytes" + "testing" +) + +var ( + input1 = ` + { + "data": { + "test": { "__twitter_id": "ABCD" }, + "users": [ + { + "id": 1, + "full_name": "Sidney Stroman", + "email": "user0@demo.com", + "__twitter_id": "2048666903444506956", + "embed": { + "id": 8, + "full_name": "Caroll Orn Sr.", + "email": "joannarau@hegmann.io", + "__twitter_id": "ABC123" + } + }, + { + "id": 2, + "full_name": "Jerry Dickinson", + "email": "user1@demo.com", + "__twitter_id": [{ "name": "hello" }, { "name": "world"}] + }, + { + "id": 3, + "full_name": "Kenna Cassin", + "email": "user2@demo.com", + "__twitter_id": { "name": "hello", "address": { "work": "1 infinity loop" } } + }, + { + "id": 4, + "full_name": "Mr. 
Pat Parisian", + "email": "__twitter_id", + "__twitter_id": 1234567890 + }, + { + "id": 5, + "full_name": "Bette Ebert", + "email": "janeenrath@goyette.com", + "__twitter_id": 1.23E + }, + { + "id": 6, + "full_name": "Everett Kiehn", + "email": "michael@bartoletti.com", + "__twitter_id": true + }, + { + "id": 7, + "full_name": "Katrina Cronin", + "email": "loretaklocko@framivolkman.org", + "__twitter_id": false + }, + { + "id": 8, + "full_name": "Caroll Orn Sr.", + "email": "joannarau@hegmann.io", + "__twitter_id": "2048666903444506956" + }, + { + "id": 9, + "full_name": "Gwendolyn Ziemann", + "email": "renaytoy@rutherford.co", + "__twitter_id": ["hello", "world"] + }, + { + "id": 10, + "full_name": "Mrs. Rosann Fritsch", + "email": "holliemosciski@thiel.org", + "__twitter_id": "2048666903444506956" + }, + { + "id": 11, + "full_name": "Arden Koss", + "email": "cristobalankunding@howewelch.org", + "__twitter_id": "2048666903444506956" + }, + { + "id": 12, + "full_name": "Brenton Bauch PhD", + "email": "renee@miller.co", + "__twitter_id": 1 + }, + { + "id": 13, + "full_name": "Daine Gleichner", + "email": "andrea@gmail.com", + "__twitter_id": "", + "id__twitter_id": "NOOO", + "work_email": "andrea@nienow.co" + } + ]} + }` + + input2 = ` + [{ + "id": 1, + "full_name": "Sidney Stroman", + "email": "user0@demo.com", + "__twitter_id": "2048666903444506956", + "embed": { + "id": 8, + "full_name": "Caroll Orn Sr.", + "email": "joannarau@hegmann.io", + "__twitter_id": "ABC123" + } + }, + { + "m": 1, + "id": 2, + "full_name": "Jerry Dickinson", + "email": "user1@demo.com", + "__twitter_id": [{ "name": "hello" }, { "name": "world"}] + }]` + + input3 = ` + { + "data": { + "test": { "__twitter_id": "ABCD" }, + "users": [{"id":1,"embed":{"id":8}},{"id":2},{"id":3},{"id":4},{"id":5},{"id":6},{"id":7},{"id":8},{"id":9},{"id":10},{"id":11},{"id":12},{"id":13}] + } + }` + + input4 = ` + [{ + "id": 1, + "full_name": "Sidney Stroman", + "email": "user0@demo.com", + "__twitter_id": 
"2048666903444506956", + "embed": { + "id": 8, + "full_name": "Caroll Orn Sr.", + "email": "joannarau@hegmann.io", + "__twitter_id": "ABC123" + } + }, + { + "m": 1, + "id": 2, + "full_name": "Jerry Dickinson", + "email": "user1@demo.com", + "__twitter_id": [{ "name": "hello" }, { "name": "world"}] + }]` +) + +func TestGet(t *testing.T) { + values := Get([]byte(input1), [][]byte{ + []byte("__twitter_id"), + []byte("work_email"), + }) + + expected := []Field{ + {[]byte("__twitter_id"), []byte(`"ABCD"`)}, + {[]byte("__twitter_id"), []byte(`"2048666903444506956"`)}, + {[]byte("__twitter_id"), []byte(`"ABC123"`)}, + {[]byte("__twitter_id"), + []byte(`[{ "name": "hello" }, { "name": "world"}]`)}, + {[]byte("__twitter_id"), + []byte(`{ "name": "hello", "address": { "work": "1 infinity loop" } }`), + }, + {[]byte("__twitter_id"), []byte(`1234567890`)}, + {[]byte("__twitter_id"), []byte(`1.23E`)}, + {[]byte("__twitter_id"), []byte(`true`)}, + {[]byte("__twitter_id"), []byte(`false`)}, + {[]byte("__twitter_id"), []byte(`"2048666903444506956"`)}, + {[]byte("__twitter_id"), []byte(`["hello", "world"]`)}, + {[]byte("__twitter_id"), []byte(`"2048666903444506956"`)}, + {[]byte("__twitter_id"), []byte(`"2048666903444506956"`)}, + {[]byte("__twitter_id"), []byte(`1`)}, + {[]byte("__twitter_id"), []byte(`""`)}, + {[]byte("work_email"), []byte(`"andrea@nienow.co"`)}, + } + + if len(values) != len(expected) { + t.Fatal("len(values) != len(expected)") + } + + for i := range expected { + if bytes.Equal(values[i].Key, expected[i].Key) == false { + t.Error(string(values[i].Key), " != ", string(expected[i].Key)) + } + + if bytes.Equal(values[i].Value, expected[i].Value) == false { + t.Error(string(values[i].Value), " != ", string(expected[i].Value)) + } + } +} + +func TestFilter(t *testing.T) { + var b bytes.Buffer + Filter(&b, []byte(input2), []string{"id", "full_name", "embed"}) + + expected := `[{"id": 1,"full_name": "Sidney Stroman","embed": {"id": 8,"full_name": "Caroll Orn 
Sr.","email": "joannarau@hegmann.io","__twitter_id": "ABC123"}},{"id": 2,"full_name": "Jerry Dickinson"}]` + + if b.String() != expected { + t.Error("Does not match expected json") + } +} + +func TestStrip(t *testing.T) { + value := Strip([]byte(input3), []string{"data", "users"}) + + expected := []byte(`[{"id":1,"embed":{"id":8}},{"id":2},{"id":3},{"id":4},{"id":5},{"id":6},{"id":7},{"id":8},{"id":9},{"id":10},{"id":11},{"id":12},{"id":13}]`) + + if bytes.Equal(value, expected) == false { + t.Log(value) + t.Error("Does not match expected json") + } +} + +func TestReplace(t *testing.T) { + var buf bytes.Buffer + + from := []Field{ + {[]byte("__twitter_id"), []byte(`[{ "name": "hello" }, { "name": "world"}]`)}, + {[]byte("__twitter_id"), []byte(`"ABC123"`)}, + } + + to := []Field{ + {[]byte("__twitter_id"), []byte(`"1234567890"`)}, + {[]byte("some_list"), []byte(`[{"id":1,"embed":{"id":8}},{"id":2},{"id":3},{"id":4},{"id":5},{"id":6},{"id":7},{"id":8},{"id":9},{"id":10},{"id":11},{"id":12},{"id":13}]`)}, + } + + expected := `[{ + "id": 1, + "full_name": "Sidney Stroman", + "email": "user0@demo.com", + "__twitter_id": "2048666903444506956", + "embed": { + "id": 8, + "full_name": "Caroll Orn Sr.", + "email": "joannarau@hegmann.io", + "some_list":[{"id":1,"embed":{"id":8}},{"id":2},{"id":3},{"id":4},{"id":5},{"id":6},{"id":7},{"id":8},{"id":9},{"id":10},{"id":11},{"id":12},{"id":13}] + } + }, + { + "m": 1, + "id": 2, + "full_name": "Jerry Dickinson", + "email": "user1@demo.com", + "__twitter_id":"1234567890" + }]` + + err := Replace(&buf, []byte(input4), from, to) + if err != nil { + t.Fatal(err) + } + + if buf.String() != expected { + t.Error("Does not match expected json") + } +} + +func BenchmarkGet(b *testing.B) { + b.ReportAllocs() + + for n := 0; n < b.N; n++ { + Get([]byte(input1), [][]byte{[]byte("__twitter_id")}) + } +} + +func BenchmarkFilter(b *testing.B) { + var buf bytes.Buffer + + keys := []string{"id", "full_name", "embed", "email", "__twitter_id"} + 
b.ResetTimer() + b.ReportAllocs() + + for n := 0; n < b.N; n++ { + err := Filter(&buf, []byte(input2), keys) + if err != nil { + b.Fatal(err) + } + buf.Reset() + } +} + +func BenchmarkStrip(b *testing.B) { + b.ReportAllocs() + + for n := 0; n < b.N; n++ { + Strip([]byte(input3), []string{"data", "users"}) + } +} + +func BenchmarkReplace(b *testing.B) { + var buf bytes.Buffer + + from := []Field{ + {[]byte("__twitter_id"), []byte(`[{ "name": "hello" }, { "name": "world"}]`)}, + {[]byte("__twitter_id"), []byte(`"ABC123"`)}, + } + + to := []Field{ + {[]byte("__twitter_id"), []byte(`"1234567890"`)}, + {[]byte("some_list"), []byte(`[{"id":1,"embed":{"id":8}},{"id":2},{"id":3},{"id":4},{"id":5},{"id":6},{"id":7},{"id":8},{"id":9},{"id":10},{"id":11},{"id":12},{"id":13}]`)}, + } + + b.ResetTimer() + b.ReportAllocs() + + for n := 0; n < b.N; n++ { + err := Replace(&buf, []byte(input4), from, to) + if err != nil { + b.Fatal(err) + } + buf.Reset() + } +} diff --git a/json/replace.go b/json/replace.go new file mode 100644 index 0000000..197e3e6 --- /dev/null +++ b/json/replace.go @@ -0,0 +1,159 @@ +package json + +import ( + "bytes" + "crypto/sha1" + "errors" +) + +func Replace(w *bytes.Buffer, b []byte, from, to []Field) error { + if len(from) != len(to) { + return errors.New("'from' and 'to' must be of the same length") + } + + state := expectKey + ws, we := 0, len(b) + + s := 0 + fi := -1 + + fmap := make(map[[20]byte]int, (len(from) * 2)) + tmap := make(map[[20]byte]int, (len(from))) + + for i, f := range from { + h1 := sha1.Sum(f.Key) + n, ok := fmap[h1] + if !ok { + fmap[h1] = i + n = i + } + + h2 := sha1.Sum(f.Value) + fmap[h2] = n + tmap[h2] = i + } + + for i := 0; i < len(b); i++ { + switch { + case ws == 0 && b[i] == '{' || b[i] == '[': + ws = i + + case state == expectKey && b[i] == '"': + state = expectKeyClose + s = i + 1 + continue + + case state == expectKeyClose && b[i] == '"': + state = expectColon + + default: + continue + } + + if state != expectColon { + 
continue + } + + h1 := sha1.Sum(b[s:i]) + if n, ok := fmap[h1]; ok { + we = s + fi = n + } + + e := 0 + d := 0 + for ; i < len(b); i++ { + if state == expectObjClose || state == expectListClose { + switch b[i] { + case '{', '[': + d++ + case '}', ']': + d-- + } + } + + switch { + case state == expectColon && b[i] == ':': + state = expectValue + + case state == expectValue && b[i] == '"': + state = expectString + s = i + + case state == expectString && b[i] == '"': + e = i + + case state == expectValue && b[i] == '[': + state = expectListClose + s = i + d++ + + case state == expectListClose && d == 0 && b[i] == ']': + e = i + + case state == expectValue && b[i] == '{': + state = expectObjClose + s = i + d++ + + case state == expectObjClose && d == 0 && b[i] == '}': + e = i + + case state == expectValue && (b[i] >= '0' && b[i] <= '9'): + state = expectNumClose + s = i + + case state == expectNumClose && + ((b[i] < '0' || b[i] > '9') && + (b[i] != '.' && b[i] != 'e' && b[i] != 'E' && b[i] != '+' && b[i] != '-')): + i-- + e = i + + case state == expectValue && + (b[i] == 'f' || b[i] == 'F' || b[i] == 't' || b[i] == 'T'): + state = expectBoolClose + s = i + + case state == expectBoolClose && (b[i] == 'e' || b[i] == 'E'): + e = i + } + + if e != 0 { + e++ + h2 := sha1.Sum(b[s:e]) + + if n, ok1 := fmap[h2]; ok1 && n == fi { + ti, ok2 := tmap[h2] + + if ok2 { + if _, err := w.Write(b[ws:we]); err != nil { + return err + } + if _, err := w.Write(to[ti].Key); err != nil { + return err + } + if _, err := w.WriteString(`":`); err != nil { + return err + } + if _, err := w.Write(to[ti].Value); err != nil { + return err + } + ws = e + } + } + + if ws != e && (b[s] == '[' || b[s] == '{') { + i = s - 1 + } + + state = expectKey + we = len(b) + fi = -1 + break + } + } + } + + w.Write(b[ws:we]) + return nil +} diff --git a/json/strip.go b/json/strip.go new file mode 100644 index 0000000..19392be --- /dev/null +++ b/json/strip.go @@ -0,0 +1,118 @@ +package json + +import ( + "bytes" 
// Strip walks b, a JSON document, along path and returns the object or array
// stored under the final path element.
//
// Each path element names an object key. For every element, Strip takes the
// first occurrence of that key in document order (at any nesting depth) inside
// the value selected so far, and narrows the search to that key's value. Keys
// are compared against the raw bytes between the quotes; escape sequences are
// not decoded.
//
// The returned slice aliases b; no bytes are copied. Strip returns nil when
// path is empty, when a key is not found, when the first match for a key holds
// a scalar (string, number, boolean or null) rather than an object or array,
// or when the matched value is truncated.
func Strip(b []byte, path []string) []byte {
	if len(path) == 0 {
		return nil
	}

	for _, key := range path {
		b = stripFindKey(b, key)
		if b == nil {
			return nil
		}
	}

	return b
}

// stripFindKey returns the object or array value of the first key named key
// in b, or nil if there is no such key or its value is not a container.
func stripFindKey(b []byte, key string) []byte {
	for i := 0; i < len(b); {
		q := bytes.IndexByte(b[i:], '"')
		if q < 0 {
			return nil
		}

		open := i + q
		end := stripStringEnd(b, open)
		if end < 0 {
			return nil
		}

		// A string is an object key only when a colon follows it; anything
		// else is a string value and must never be matched against the path.
		colon := stripSkipSpace(b, end+1)
		if colon == len(b) || b[colon] != ':' {
			i = end + 1
			continue
		}

		// Comparing via string(...) does not allocate.
		if string(b[open+1:end]) != key {
			i = colon + 1
			continue
		}

		// Only containers can be searched further or returned. Bailing out
		// here also guarantees termination for numeric values.
		v := stripSkipSpace(b, colon+1)
		if v == len(b) || (b[v] != '{' && b[v] != '[') {
			return nil
		}

		end = stripContainerEnd(b, v)
		if end < 0 {
			return nil
		}

		return b[v : end+1]
	}

	return nil
}

// stripStringEnd returns the index of the quote closing the string literal
// that opens at b[start], honouring backslash escapes, or -1 if the literal
// is unterminated.
func stripStringEnd(b []byte, start int) int {
	for i := start + 1; i < len(b); i++ {
		switch b[i] {
		case '\\':
			i++ // skip the escaped byte so that \" does not end the string
		case '"':
			return i
		}
	}

	return -1
}

// stripContainerEnd returns the index of the bracket closing the object or
// array that opens at b[start], or -1 if it is never closed. Brackets that
// appear inside string literals are ignored.
func stripContainerEnd(b []byte, start int) int {
	depth := 0

	for i := start; i < len(b); i++ {
		switch b[i] {
		case '"':
			i = stripStringEnd(b, i)
			if i < 0 {
				return -1
			}
		case '{', '[':
			depth++
		case '}', ']':
			depth--
			if depth == 0 {
				return i
			}
		}
	}

	return -1
}

// stripSkipSpace returns the index of the first byte at or after i that is
// not JSON whitespace, or len(b) if only whitespace remains.
func stripSkipSpace(b []byte, i int) int {
	for i < len(b) {
		switch b[i] {
		case ' ', '\t', '\n', '\r':
			i++
		default:
			return i
		}
	}

	return i
}