feat: basic zim support
arcad/edge/pipeline/head There was a failure building this commit
Details
arcad/edge/pipeline/head There was a failure building this commit
Details
This commit is contained in:
parent
17808d14c9
commit
f3c5eee8c8
15
go.mod
15
go.mod
|
@ -1,10 +1,13 @@
|
|||
module forge.cadoles.com/arcad/edge
|
||||
|
||||
go 1.19
|
||||
go 1.20
|
||||
|
||||
require (
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.3
|
||||
github.com/hashicorp/mdns v1.0.5
|
||||
github.com/klauspost/compress v1.16.6
|
||||
github.com/lestrrat-go/jwx/v2 v2.0.8
|
||||
github.com/ulikunitz/xz v0.5.11
|
||||
modernc.org/sqlite v1.20.4
|
||||
)
|
||||
|
||||
|
@ -59,12 +62,12 @@ require (
|
|||
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
|
||||
gitlab.com/wpetit/goweb v0.0.0-20230419082146-a94d9ed7202b
|
||||
go.opencensus.io v0.22.5 // indirect
|
||||
golang.org/x/crypto v0.7.0
|
||||
golang.org/x/crypto v0.10.0
|
||||
golang.org/x/mod v0.10.0
|
||||
golang.org/x/net v0.9.0 // indirect
|
||||
golang.org/x/sys v0.7.0 // indirect
|
||||
golang.org/x/term v0.7.0 // indirect
|
||||
golang.org/x/text v0.9.0 // indirect
|
||||
golang.org/x/net v0.11.0
|
||||
golang.org/x/sys v0.9.0 // indirect
|
||||
golang.org/x/term v0.9.0 // indirect
|
||||
golang.org/x/text v0.10.0 // indirect
|
||||
golang.org/x/tools v0.8.0 // indirect
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0
|
||||
|
|
26
go.sum
26
go.sum
|
@ -188,6 +188,8 @@ github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad
|
|||
github.com/hashicorp/go.net v0.0.0-20151006203346-104dcad90073/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90=
|
||||
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
|
||||
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.3 h1:kmRrRLlInXvng0SmLxmQpQkpbYAvcXm7NPDrgxJa9mE=
|
||||
github.com/hashicorp/golang-lru/v2 v2.0.3/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
|
||||
github.com/hashicorp/mdns v0.0.0-20151206042412-9d85cf22f9f8/go.mod h1:aa76Av3qgPeIQp9Y3qIkTBPieQYNkQ13Kxe7pze9Wb0=
|
||||
github.com/hashicorp/mdns v1.0.5 h1:1M5hW1cunYeoXOqHwEb/GBDDHAFo0Yqb/uz/beC6LbE=
|
||||
|
@ -202,6 +204,8 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X
|
|||
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs=
|
||||
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8=
|
||||
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
|
||||
github.com/klauspost/compress v1.16.6 h1:91SKEy4K37vkp255cJ8QesJhjyRO0hn9i9G0GoUwLsk=
|
||||
github.com/klauspost/compress v1.16.6/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
|
||||
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
|
||||
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
|
||||
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
|
||||
|
@ -277,6 +281,8 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
|
|||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8=
|
||||
github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
|
||||
github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA=
|
||||
github.com/urfave/cli/v2 v2.24.3 h1:7Q1w8VN8yE0MJEHP06bv89PjYsN4IHWED2s1v/Zlfm0=
|
||||
github.com/urfave/cli/v2 v2.24.3/go.mod h1:GHupkWPMM0M/sj1a2b4wUrWBPzazNrIjouW6fmdJLxc=
|
||||
|
@ -306,8 +312,8 @@ golang.org/x/crypto v0.0.0-20191206172530-e9b2fee46413/go.mod h1:LzIPMQfyMNhhGPh
|
|||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/crypto v0.0.0-20220427172511-eb4f295cb31f/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
|
||||
golang.org/x/crypto v0.7.0 h1:AvwMYaRytfdeVt3u6mLaxYtErKYjxA2OXjJ1HHq6t3A=
|
||||
golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU=
|
||||
golang.org/x/crypto v0.10.0 h1:LKqV2xt9+kDzSTfOhx4FrkEBcMrAgHSYgzywV9zcGmM=
|
||||
golang.org/x/crypto v0.10.0/go.mod h1:o4eNf7Ede1fv+hwOwZsTHl9EsPFO6q6ZvYR8vYfY45I=
|
||||
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
|
||||
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
|
||||
golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
|
||||
|
@ -379,8 +385,8 @@ golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qx
|
|||
golang.org/x/net v0.0.0-20220624214902-1bab6f366d9e/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=
|
||||
golang.org/x/net v0.9.0 h1:aWJ/m6xSmxWBx+V0XRHTlrYrPG56jKsLdTFmsSsCzOM=
|
||||
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
|
||||
golang.org/x/net v0.11.0 h1:Gi2tvZIJyBtO9SDr1q9h5hEQCp/4L2RQ+ar0qjx2oNU=
|
||||
golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ=
|
||||
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
|
||||
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
|
||||
|
@ -442,13 +448,13 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
|
|||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU=
|
||||
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.9.0 h1:KS/R3tvhPqvJvwcKfnBHJwwthS11LRhmM5D59eEXa0s=
|
||||
golang.org/x/sys v0.9.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/term v0.3.0/go.mod h1:q750SLmJuPmVoN1blW3UFBPREJfb1KmY3vwxfr+nFDA=
|
||||
golang.org/x/term v0.7.0 h1:BEvjmm5fURWqcfbSKTdpkDXYBrUS1c0m8agp14W48vQ=
|
||||
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
|
||||
golang.org/x/term v0.9.0 h1:GRRCnKYhdQrD8kfRAdQ6Zcw1P0OcELxGLKJvtjVMZ28=
|
||||
golang.org/x/term v0.9.0/go.mod h1:M6DEAAIenWoTxdKrOltXcmDY3rSplQUkrvaDU5FcQyo=
|
||||
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
|
@ -458,8 +464,8 @@ golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
|||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
||||
golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
|
||||
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||
golang.org/x/text v0.10.0 h1:UpjohKhiEgNc0CSauXmwYftY1+LlaC75SJwh0SgCX58=
|
||||
golang.org/x/text v0.10.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
|
||||
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
|
||||
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
|
||||
|
|
|
@ -13,6 +13,7 @@ type ArchiveExt string
|
|||
const (
|
||||
ExtZip ArchiveExt = "zip"
|
||||
ExtTarGz ArchiveExt = "tar.gz"
|
||||
ExtZim ArchiveExt = "zim"
|
||||
)
|
||||
|
||||
func FromPath(path string) (Bundle, error) {
|
||||
|
@ -56,5 +57,14 @@ func matchArchivePattern(archivePath string) (Bundle, error) {
|
|||
return NewZipBundle(archivePath), nil
|
||||
}
|
||||
|
||||
matches, err = filepath.Match(fmt.Sprintf("*.%s", ExtZim), base)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "could not match file archive '%s'", archivePath)
|
||||
}
|
||||
|
||||
if matches {
|
||||
return NewZimBundle(archivePath), nil
|
||||
}
|
||||
|
||||
return nil, errors.WithStack(ErrUnknownBundleArchiveExt)
|
||||
}
|
||||
|
|
|
@ -0,0 +1,283 @@
|
|||
package zim
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
lru "github.com/hashicorp/golang-lru/v2"
|
||||
)
|
||||
|
||||
// Reserved values of Article.EntryType. Any other value is used as an index
// into ZimReader.mimeTypeList. Article.Data and Article.MimeType return an
// empty result for all three. Only RedirectEntry is explicitly typed; the
// other two are untyped constants.
const (
|
||||
// RedirectEntry marks an entry that redirects to another entry.
RedirectEntry uint16 = 0xffff
|
||||
// LinkTargetEntry marks a link-target entry.
LinkTargetEntry = 0xfffe
|
||||
// DeletedEntry marks a deleted entry.
DeletedEntry = 0xfffd
|
||||
)
|
||||
|
||||
// articlePool supplies the *Article values handed out by ZimReader.ArticleAt.
// It is package-global and is (re)initialised by every call to NewReader.
var articlePool sync.Pool
|
||||
|
||||
// the recent uncompressed blobs, mainly useful while indexing and asking
|
||||
// for the same blob again and again
|
||||
var bcache *lru.Cache[any, any]
|
||||
|
||||
// Article is one decoded directory entry of a ZIM file, as filled in by
// ZimReader.FillArticleAt.
type Article struct {
|
||||
// EntryType is a RedirectEntry/LinkTargetEntry/DeletedEntry or an idx
|
||||
// pointing to ZimReader.mimeTypeList
|
||||
EntryType uint16
|
||||
// Title is the entry title with its NUL terminator removed.
Title string
|
||||
// URLPtr is the file offset the entry was decoded from.
URLPtr uint64
|
||||
// Namespace is the single byte read at entry offset +3.
Namespace byte
|
||||
// url is the entry URL without namespace; see FullURL.
url string
|
||||
// blob is the uint32 read at entry offset +12.
blob uint32
|
||||
// cluster is the uint32 read at entry offset +8; for a RedirectEntry it
|
||||
// is exposed as the redirect index (see RedirectIndex).
cluster uint32
|
||||
// z is the reader the entry was decoded from.
z *ZimReader
|
||||
}
|
||||
|
||||
// convenient method to return the Article at URL index idx
|
||||
func (z *ZimReader) ArticleAtURLIdx(idx uint32) (*Article, error) {
|
||||
o, err := z.OffsetAtURLIdx(idx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return z.ArticleAt(o)
|
||||
}
|
||||
|
||||
// return the article main page if it exists
|
||||
func (z *ZimReader) MainPage() (*Article, error) {
|
||||
if z.mainPage == 0xffffffff {
|
||||
return nil, nil
|
||||
}
|
||||
return z.ArticleAtURLIdx(z.mainPage)
|
||||
}
|
||||
|
||||
// get the article (Directory) pointed by the offset found in URLpos or Titlepos
|
||||
func (z *ZimReader) ArticleAt(offset uint64) (*Article, error) {
|
||||
a := articlePool.Get().(*Article)
|
||||
err := z.FillArticleAt(a, offset)
|
||||
return a, err
|
||||
}
|
||||
|
||||
// Fill an article with datas found at offset
|
||||
func (z *ZimReader) FillArticleAt(a *Article, offset uint64) error {
|
||||
a.z = z
|
||||
a.URLPtr = offset
|
||||
|
||||
mimeIdx, err := readInt16(z.bytesRangeAt(offset, offset+2))
|
||||
if err != nil {
|
||||
return fmt.Errorf("can't read article %w", err)
|
||||
}
|
||||
a.EntryType = mimeIdx
|
||||
|
||||
// Linktarget or Target Entry
|
||||
if mimeIdx == LinkTargetEntry || mimeIdx == DeletedEntry {
|
||||
// TODO
|
||||
return nil
|
||||
}
|
||||
|
||||
s, err := z.bytesRangeAt(offset+3, offset+4)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
a.Namespace = s[0]
|
||||
|
||||
a.cluster, err = readInt32(z.bytesRangeAt(offset+8, offset+8+4))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
a.blob, err = readInt32(z.bytesRangeAt(offset+12, offset+12+4))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Redirect
|
||||
if mimeIdx == RedirectEntry {
|
||||
// assume the url + title won't be longer than 2k
|
||||
b, err := z.bytesRangeAt(offset+12, offset+12+2048)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
bbuf := bytes.NewBuffer(b)
|
||||
a.url, err = bbuf.ReadString('\x00')
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
a.url = strings.TrimRight(a.url, "\x00")
|
||||
|
||||
a.Title, err = bbuf.ReadString('\x00')
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
a.Title = strings.TrimRight(a.Title, "\x00")
|
||||
return err
|
||||
}
|
||||
|
||||
b, err := z.bytesRangeAt(offset+16, offset+16+2048)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
bbuf := bytes.NewBuffer(b)
|
||||
a.url, err = bbuf.ReadString('\x00')
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
a.url = strings.TrimRight(string(a.url), "\x00")
|
||||
|
||||
title, err := bbuf.ReadString('\x00')
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
title = strings.TrimRight(string(title), "\x00")
|
||||
// This is a trick to force a copy and avoid retain of the full buffer
|
||||
// mainly for indexing title reasons
|
||||
if len(title) != 0 {
|
||||
a.Title = title[0:1] + title[1:]
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// return the uncompressed data associated with this article
|
||||
func (a *Article) Data() ([]byte, error) {
|
||||
// ensure we have data to read
|
||||
if a.EntryType == RedirectEntry || a.EntryType == LinkTargetEntry || a.EntryType == DeletedEntry {
|
||||
return nil, nil
|
||||
}
|
||||
start, end, err := a.z.clusterOffsetsAtIdx(a.cluster)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
s, err := a.z.bytesRangeAt(start, start+1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
compression := uint8(s[0])
|
||||
|
||||
// blob starts at offset, blob ends at offset
|
||||
var bs, be uint32
|
||||
|
||||
// LZMA: 4, Zstandard: 5
|
||||
if compression == 4 || compression == 5 {
|
||||
blobLookup := func() ([]byte, bool) {
|
||||
if v, ok := bcache.Get(a.cluster); ok {
|
||||
b := v.([]byte)
|
||||
return b, ok
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
|
||||
var blob []byte
|
||||
var ok bool
|
||||
var dec io.ReadCloser
|
||||
if blob, ok = blobLookup(); !ok {
|
||||
b, err := a.z.bytesRangeAt(start+1, end+1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
bbuf := bytes.NewBuffer(b)
|
||||
switch compression {
|
||||
case 5:
|
||||
dec, err = NewZstdReader(bbuf)
|
||||
|
||||
case 4:
|
||||
dec, err = NewXZReader(bbuf)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer dec.Close()
|
||||
// the decoded chunk are around 1MB
|
||||
b, err = ioutil.ReadAll(dec)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
blob = make([]byte, len(b))
|
||||
copy(blob, b)
|
||||
// TODO: 2 requests for the same blob could occure at the same time
|
||||
bcache.Add(a.cluster, blob)
|
||||
} else {
|
||||
bi, ok := bcache.Get(a.cluster)
|
||||
if !ok {
|
||||
return nil, errors.New("not in cache anymore")
|
||||
}
|
||||
blob = bi.([]byte)
|
||||
}
|
||||
|
||||
bs, err = readInt32(blob[a.blob*4:a.blob*4+4], nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
be, err = readInt32(blob[a.blob*4+4:a.blob*4+4+4], nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// avoid retaining all the chunk
|
||||
c := make([]byte, be-bs)
|
||||
copy(c, blob[bs:be])
|
||||
return c, nil
|
||||
|
||||
} else if compression == 0 || compression == 1 {
|
||||
// uncompresssed
|
||||
startPos := start + 1
|
||||
blobOffset := uint64(a.blob * 4)
|
||||
|
||||
bs, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset, startPos+blobOffset+4))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
be, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset+4, startPos+blobOffset+4+4))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return a.z.bytesRangeAt(startPos+uint64(bs), startPos+uint64(be))
|
||||
}
|
||||
|
||||
return nil, errors.New("Unhandled compression")
|
||||
}
|
||||
|
||||
func (a *Article) MimeType() string {
|
||||
if a.EntryType == RedirectEntry || a.EntryType == LinkTargetEntry || a.EntryType == DeletedEntry {
|
||||
return ""
|
||||
}
|
||||
|
||||
return a.z.mimeTypeList[a.EntryType]
|
||||
}
|
||||
|
||||
// return the url prefixed by the namespace
|
||||
func (a *Article) FullURL() string {
|
||||
return string(a.Namespace) + "/" + a.url
|
||||
}
|
||||
|
||||
func (a *Article) String() string {
|
||||
return fmt.Sprintf("Mime: 0x%x URL: [%s], Title: [%s], Cluster: 0x%x Blob: 0x%x",
|
||||
a.EntryType, a.FullURL(), a.Title, a.cluster, a.blob)
|
||||
}
|
||||
|
||||
// RedirectIndex return the redirect index of RedirectEntry type article
|
||||
// return an err if not a redirect entry
|
||||
func (a *Article) RedirectIndex() (uint32, error) {
|
||||
if a.EntryType != RedirectEntry {
|
||||
return 0, errors.New("Not a RedirectEntry")
|
||||
}
|
||||
// We use the cluster to save the redirect index position for RedirectEntry type
|
||||
return a.cluster, nil
|
||||
}
|
||||
|
||||
func (a *Article) blobOffsetsAtIdx(z *ZimReader) (start, end uint64) {
|
||||
idx := a.blob
|
||||
offset := z.clusterPtrPos + uint64(idx)*8
|
||||
start, err := readInt64(z.bytesRangeAt(offset, offset+8))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
offset = z.clusterPtrPos + uint64(idx+1)*8
|
||||
end, _ = readInt64(z.bytesRangeAt(offset, offset+8))
|
||||
|
||||
return
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
package zim
|
||||
|
||||
import "errors"
|
||||
|
||||
// ErrNotFound is returned when a requested entry does not exist in the ZIM
// file. Test for it with errors.Is, as it may be wrapped.
var ErrNotFound = errors.New("not found")
|
|
@ -0,0 +1,49 @@
|
|||
package zim
|
||||
|
||||
import "github.com/pkg/errors"
|
||||
|
||||
func (z *ZimReader) Favicon() (*Article, error) {
|
||||
illustration, err := z.getMetadataIllustration()
|
||||
if err != nil && !errors.Is(err, ErrNotFound) {
|
||||
return nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
if illustration != nil {
|
||||
return illustration, nil
|
||||
}
|
||||
|
||||
namespaces := []string{"-", "I"}
|
||||
entryNames := []string{"favicon", "favicon.png"}
|
||||
|
||||
for _, ns := range namespaces {
|
||||
for _, en := range entryNames {
|
||||
article, err := z.GetPageNoIndex(ns + "/" + en)
|
||||
if err != nil && !errors.Is(err, ErrNotFound) {
|
||||
return nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
if article != nil {
|
||||
return article, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil, errors.WithStack(ErrNotFound)
|
||||
}
|
||||
|
||||
func (z *ZimReader) getMetadataIllustration() (*Article, error) {
|
||||
metadata, err := z.Metadata(MetadataIllustration96x96at2, MetadataIllustration48x48at1)
|
||||
if err != nil {
|
||||
return nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
if _, exists := metadata[MetadataIllustration96x96at2]; exists {
|
||||
return z.GetPageNoIndex("M/" + string(MetadataIllustration96x96at2))
|
||||
}
|
||||
|
||||
if _, exists := metadata[MetadataIllustration48x48at1]; exists {
|
||||
return z.GetPageNoIndex("M/" + string(MetadataIllustration48x48at1))
|
||||
}
|
||||
|
||||
return nil, errors.WithStack(ErrNotFound)
|
||||
}
|
|
@ -0,0 +1,69 @@
|
|||
package zim
|
||||
|
||||
import (
|
||||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
// MetadataKey is the name of a metadata entry; ZimReader.Metadata looks each
// key up under the "M/" prefix.
type MetadataKey string
|
||||
|
||||
// See https://wiki.openzim.org/wiki/Metadata
|
||||
const (
|
||||
MetadataName MetadataKey = "Name"
|
||||
MetadataTitle MetadataKey = "Title"
|
||||
MetadataDescription MetadataKey = "Description"
|
||||
MetadataLongDescription MetadataKey = "LongDescription"
|
||||
MetadataCreator MetadataKey = "Creator"
|
||||
MetadataTags MetadataKey = "Tags"
|
||||
MetadataDate MetadataKey = "Date"
|
||||
MetadataPublisher MetadataKey = "Publisher"
|
||||
MetadataFlavour MetadataKey = "Flavour"
|
||||
MetadataSource MetadataKey = "Source"
|
||||
MetadataLanguage MetadataKey = "Language"
|
||||
MetadataIllustration48x48at1 MetadataKey = "Illustration_48x48@1"
|
||||
MetadataIllustration96x96at2 MetadataKey = "Illustration_96x96@2"
|
||||
)
|
||||
|
||||
// knownKeys is the set of keys ZimReader.Metadata looks up when it is called
// without any explicit key.
var knownKeys = []MetadataKey{
|
||||
MetadataName,
|
||||
MetadataTitle,
|
||||
MetadataDescription,
|
||||
MetadataLongDescription,
|
||||
MetadataCreator,
|
||||
MetadataPublisher,
|
||||
MetadataLanguage,
|
||||
MetadataTags,
|
||||
MetadataDate,
|
||||
MetadataFlavour,
|
||||
MetadataSource,
|
||||
MetadataIllustration48x48at1,
|
||||
MetadataIllustration96x96at2,
|
||||
}
|
||||
|
||||
// Metadata returns a copy of the internal metadata map of the ZIM file.
|
||||
func (z *ZimReader) Metadata(keys ...MetadataKey) (map[MetadataKey]string, error) {
|
||||
if len(keys) == 0 {
|
||||
keys = knownKeys
|
||||
}
|
||||
|
||||
metadata := make(map[MetadataKey]string)
|
||||
|
||||
for _, key := range keys {
|
||||
article, err := z.GetPageNoIndex("M/" + string(key))
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNotFound) {
|
||||
continue
|
||||
}
|
||||
|
||||
return nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
data, err := article.Data()
|
||||
if errors.Is(err, ErrNotFound) {
|
||||
continue
|
||||
}
|
||||
|
||||
metadata[key] = string(data)
|
||||
}
|
||||
|
||||
return metadata, nil
|
||||
}
|
Binary file not shown.
|
@ -0,0 +1,43 @@
|
|||
package zim
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
)
|
||||
|
||||
// readInt64 decodes the first 8 bytes of b as a little endian uint64.
// It takes the (bytes, error) pair of a read call directly: a non-nil err is
// handed back untouched and b is ignored.
func readInt64(b []byte, err error) (v uint64, aerr error) {
	if err != nil {
		return 0, err
	}

	aerr = binary.Read(bytes.NewReader(b), binary.LittleEndian, &v)

	return v, aerr
}
|
||||
|
||||
// readInt32 decodes the first 4 bytes of b as a little endian uint32.
// It takes the (bytes, error) pair of a read call directly: a non-nil err is
// handed back untouched and b is ignored.
func readInt32(b []byte, err error) (v uint32, aerr error) {
	if err != nil {
		return 0, err
	}

	aerr = binary.Read(bytes.NewReader(b), binary.LittleEndian, &v)

	return v, aerr
}
|
||||
|
||||
// readInt16 decodes the first 2 bytes of b as a little endian uint16.
// It takes the (bytes, error) pair of a read call directly: a non-nil err is
// handed back untouched and b is ignored.
func readInt16(b []byte, err error) (v uint16, aerr error) {
	if err != nil {
		return 0, err
	}

	aerr = binary.Read(bytes.NewReader(b), binary.LittleEndian, &v)

	return v, aerr
}
|
|
@ -0,0 +1,23 @@
|
|||
package zim
|
||||
|
||||
import (
|
||||
"io"
|
||||
|
||||
"github.com/ulikunitz/xz"
|
||||
)
|
||||
|
||||
// XZReader embeds an *xz.Reader and adds a no-op Close method to it.
type XZReader struct {
|
||||
*xz.Reader
|
||||
}
|
||||
|
||||
func NewXZReader(r io.Reader) (*XZReader, error) {
|
||||
dec, err := xz.NewReader(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &XZReader{dec}, nil
|
||||
}
|
||||
|
||||
func (xr *XZReader) Close() error {
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,317 @@
|
|||
package zim
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
lru "github.com/hashicorp/golang-lru/v2"
|
||||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
const (
|
||||
// zimHeader is the magic number expected in the first 4 bytes of the
|
||||
// file, read as a little endian uint32 by readFileHeaders.
zimHeader = 72173914
|
||||
)
|
||||
|
||||
// ZimReader keeps track of everything needed to read a ZIM file. The header
// fields below are populated by readFileHeaders.
|
||||
type ZimReader struct {
|
||||
// f is the opened ZIM file every read goes through.
f *os.File
|
||||
// UUID is the uint32 decoded from the start of header bytes 8..16.
UUID uint32
|
||||
// ArticleCount is read from header offset 24.
ArticleCount uint32
|
||||
// clusterCount is read from header offset 28.
clusterCount uint32
|
||||
// urlPtrPos is the file position of the URL pointer list (8 bytes per entry).
urlPtrPos uint64
|
||||
// titlePtrPos is the file position of the title pointer list (4 bytes per entry).
titlePtrPos uint64
|
||||
// clusterPtrPos is the file position of the cluster pointer list (8 bytes per entry).
clusterPtrPos uint64
|
||||
// mimeListPos is the file position of the NUL-separated MIME type list.
mimeListPos uint64
|
||||
// mainPage is the URL index of the main page, 0xffffffff when unset.
mainPage uint32
|
||||
// layoutPage is read from header offset 68; NewReader presets it to 0xffffffff.
layoutPage uint32
|
||||
// mimeTypeList caches the result of MimeTypes.
mimeTypeList []string
|
||||
}
|
||||
|
||||
// create a new zim reader
|
||||
func NewReader(path string) (*ZimReader, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
z := ZimReader{f: f, mainPage: 0xffffffff, layoutPage: 0xffffffff}
|
||||
|
||||
articlePool = sync.Pool{
|
||||
New: func() interface{} {
|
||||
return new(Article)
|
||||
},
|
||||
}
|
||||
// keep 4 latest uncompressed blobs, around 1M per blob
|
||||
bcache, _ = lru.New[any, any](5)
|
||||
|
||||
err = z.readFileHeaders()
|
||||
return &z, err
|
||||
}
|
||||
|
||||
// Return an ordered list of mime types present in the ZIM file
|
||||
func (z *ZimReader) MimeTypes() []string {
|
||||
if len(z.mimeTypeList) != 0 {
|
||||
return z.mimeTypeList
|
||||
}
|
||||
|
||||
var s []string
|
||||
// assume mime list fit in 2k
|
||||
b, err := z.bytesRangeAt(z.mimeListPos, z.mimeListPos+2048)
|
||||
if err != nil {
|
||||
return s
|
||||
}
|
||||
bbuf := bytes.NewBuffer(b)
|
||||
|
||||
for {
|
||||
line, err := bbuf.ReadBytes('\x00')
|
||||
if err != nil && err != io.EOF {
|
||||
return s
|
||||
}
|
||||
// a line of 1 is a line containing only \x00 and it's the marker for the
|
||||
// end of mime types list
|
||||
if len(line) == 1 {
|
||||
break
|
||||
}
|
||||
s = append(s, strings.TrimRight(string(line), "\x00"))
|
||||
}
|
||||
z.mimeTypeList = s
|
||||
return s
|
||||
}
|
||||
|
||||
// list all articles, using url index, contained in a zim file
|
||||
// note that this is a slow implementation, a real iterator is faster
|
||||
// you are not suppose to use this method on big zim files, use indexes
|
||||
func (z *ZimReader) ListArticles() <-chan *Article {
|
||||
ch := make(chan *Article, 10)
|
||||
|
||||
go func() {
|
||||
var idx uint32
|
||||
// starting at 1 to avoid "con" entry
|
||||
var start uint32 = 1
|
||||
|
||||
for idx = start; idx < z.ArticleCount; idx++ {
|
||||
art, err := z.ArticleAtURLIdx(idx)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if art == nil {
|
||||
// TODO: deal with redirect continue
|
||||
}
|
||||
ch <- art
|
||||
}
|
||||
close(ch)
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
// list all title pointer, Titles by position contained in a zim file
|
||||
// Titles are pointers to URLpos index, useful for indexing cause smaller to store: uint32
|
||||
// note that this is a slow implementation, a real iterator is faster
|
||||
// you are not suppose to use this method on big zim files prefer ListTitlesPtrIterator to build your index
|
||||
func (z *ZimReader) ListTitlesPtr() <-chan uint32 {
|
||||
ch := make(chan uint32, 10)
|
||||
|
||||
go func() {
|
||||
var pos uint64
|
||||
var count uint32
|
||||
|
||||
for pos = z.titlePtrPos; count < z.ArticleCount; pos += 4 {
|
||||
idx, err := readInt32(z.bytesRangeAt(pos, pos+4))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
ch <- idx
|
||||
count++
|
||||
}
|
||||
close(ch)
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
// list all title pointer, Titles by position contained in a zim file
|
||||
// Titles are pointers to URLpos index, usefull for indexing cause smaller to store: uint32
|
||||
func (z *ZimReader) ListTitlesPtrIterator(cb func(uint32)) {
|
||||
var count uint32
|
||||
for pos := z.titlePtrPos; count < z.ArticleCount; pos += 4 {
|
||||
idx, err := readInt32(z.bytesRangeAt(pos, pos+4))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
cb(idx)
|
||||
count++
|
||||
}
|
||||
}
|
||||
|
||||
// return the article at the exact url not using any index
|
||||
func (z *ZimReader) GetPageNoIndex(url string) (*Article, error) {
|
||||
// starting at 1 to avoid "con" entry
|
||||
var start uint32
|
||||
stop := z.ArticleCount
|
||||
|
||||
a := new(Article)
|
||||
|
||||
for {
|
||||
pos := (start + stop) / 2
|
||||
|
||||
offset, err := z.OffsetAtURLIdx(pos)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
err = z.FillArticleAt(a, offset)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if a.FullURL() == url {
|
||||
return a, nil
|
||||
}
|
||||
|
||||
if a.FullURL() > url {
|
||||
stop = pos
|
||||
} else {
|
||||
start = pos
|
||||
}
|
||||
if stop-start == 1 {
|
||||
break
|
||||
}
|
||||
|
||||
}
|
||||
return nil, errors.WithStack(ErrNotFound)
|
||||
}
|
||||
|
||||
// get the offset pointing to Article at pos in the URL idx
|
||||
func (z *ZimReader) OffsetAtURLIdx(idx uint32) (uint64, error) {
|
||||
offset := z.urlPtrPos + uint64(idx)*8
|
||||
return readInt64(z.bytesRangeAt(offset, offset+8))
|
||||
}
|
||||
|
||||
// Close & cleanup the zimreader
|
||||
func (z *ZimReader) Close() error {
|
||||
return z.f.Close()
|
||||
}
|
||||
|
||||
func (z *ZimReader) String() string {
|
||||
fi, err := z.f.Stat()
|
||||
if err != nil {
|
||||
return "corrupted zim"
|
||||
}
|
||||
return fmt.Sprintf("Size: %d, ArticleCount: %d urlPtrPos: 0x%x titlePtrPos: 0x%x mimeListPos: 0x%x clusterPtrPos: 0x%x\nMimeTypes: %v",
|
||||
fi.Size(), z.ArticleCount, z.urlPtrPos, z.titlePtrPos, z.mimeListPos, z.clusterPtrPos, z.MimeTypes())
|
||||
}
|
||||
|
||||
// getBytesRangeAt returns bytes from start to end
|
||||
// it's needed to abstract mmap usages rather than read directly on the mmap slices
|
||||
func (z *ZimReader) bytesRangeAt(start, end uint64) ([]byte, error) {
|
||||
buf := make([]byte, end-start)
|
||||
n, err := z.f.ReadAt(buf, int64(start))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("can't read bytes %w", err)
|
||||
}
|
||||
|
||||
if n != int(end-start) {
|
||||
return nil, errors.New("can't read enough bytes")
|
||||
}
|
||||
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
// populate the ZimReader structs with headers
|
||||
func (z *ZimReader) readFileHeaders() error {
|
||||
// checking for file type
|
||||
v, err := readInt32(z.bytesRangeAt(0, 0+4))
|
||||
if err != nil || v != zimHeader {
|
||||
return errors.New("not a ZIM file")
|
||||
}
|
||||
|
||||
// checking for version
|
||||
v, err = readInt32(z.bytesRangeAt(4, 4+4))
|
||||
if err != nil {
|
||||
return errors.Wrap(err, "could not read file version")
|
||||
}
|
||||
|
||||
// checking for articles count
|
||||
v, err = readInt32(z.bytesRangeAt(8, 16))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
z.UUID = v
|
||||
|
||||
// checking for articles count
|
||||
v, err = readInt32(z.bytesRangeAt(24, 24+4))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
z.ArticleCount = v
|
||||
|
||||
// checking for cluster count
|
||||
v, err = readInt32(z.bytesRangeAt(28, 28+4))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
z.clusterCount = v
|
||||
|
||||
// checking for urlPtrPos
|
||||
vb, err := readInt64(z.bytesRangeAt(32, 32+8))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
z.urlPtrPos = vb
|
||||
|
||||
// checking for titlePtrPos
|
||||
vb, err = readInt64(z.bytesRangeAt(40, 40+8))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
z.titlePtrPos = vb
|
||||
|
||||
// checking for clusterPtrPos
|
||||
vb, err = readInt64(z.bytesRangeAt(48, 48+8))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
z.clusterPtrPos = vb
|
||||
|
||||
// checking for mimeListPos
|
||||
vb, err = readInt64(z.bytesRangeAt(56, 56+8))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
z.mimeListPos = vb
|
||||
|
||||
// checking for mainPage
|
||||
v, err = readInt32(z.bytesRangeAt(64, 64+4))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
z.mainPage = v
|
||||
|
||||
// checking for layoutPage
|
||||
v, err = readInt32(z.bytesRangeAt(68, 68+4))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
z.layoutPage = v
|
||||
|
||||
z.MimeTypes()
|
||||
return nil
|
||||
}
|
||||
|
||||
// return start and end offsets for cluster at index idx
|
||||
func (z *ZimReader) clusterOffsetsAtIdx(idx uint32) (start, end uint64, err error) {
|
||||
offset := z.clusterPtrPos + (uint64(idx) * 8)
|
||||
start, err = readInt64(z.bytesRangeAt(offset, offset+8))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
offset = z.clusterPtrPos + (uint64(idx+1) * 8)
|
||||
end, err = readInt64(z.bytesRangeAt(offset, offset+8))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
end--
|
||||
return
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
package zim
|
||||
|
||||
import (
|
||||
"log"
|
||||
"testing"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
)
|
||||
|
||||
// Z is the reader shared by the tests of this file; it is assigned in init.
var Z *ZimReader
|
||||
|
||||
func init() {
|
||||
var err error
|
||||
Z, err = NewReader("testdata/wikibooks_af_all_maxi_2023-06.zim")
|
||||
if err != nil {
|
||||
log.Panicf("Can't read %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOpen(t *testing.T) {
|
||||
if Z.ArticleCount == 0 {
|
||||
t.Errorf("No article found")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMime(t *testing.T) {
|
||||
if len(Z.MimeTypes()) == 0 {
|
||||
t.Errorf("No mime types found")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDisplayInfost(t *testing.T) {
|
||||
info := Z.String()
|
||||
if len(info) < 0 {
|
||||
t.Errorf("Can't read infos")
|
||||
}
|
||||
t.Log(info)
|
||||
}
|
||||
|
||||
func TestURLAtIdx(t *testing.T) {
|
||||
// addr 0 is a redirect
|
||||
p, _ := Z.OffsetAtURLIdx(5)
|
||||
a, _ := Z.ArticleAt(p)
|
||||
if a == nil {
|
||||
t.Errorf("Can't find 1st url")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDisplayArticle(t *testing.T) {
|
||||
// addr 0 is a redirect
|
||||
p, _ := Z.OffsetAtURLIdx(5)
|
||||
a, _ := Z.ArticleAt(p)
|
||||
if a == nil {
|
||||
t.Errorf("Can't find 1st url")
|
||||
}
|
||||
|
||||
t.Log(a)
|
||||
}
|
||||
|
||||
func TestPageNoIndex(t *testing.T) {
|
||||
a, _ := Z.GetPageNoIndex("A/Dracula:Capitol_1.html")
|
||||
if a == nil {
|
||||
t.Errorf("Can't find existing url")
|
||||
}
|
||||
}
|
||||
|
||||
func TestListArticles(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping test in short mode.")
|
||||
}
|
||||
|
||||
var i uint32
|
||||
|
||||
for a := range Z.ListArticles() {
|
||||
i++
|
||||
t.Log(a.String())
|
||||
}
|
||||
|
||||
if i == 0 {
|
||||
t.Errorf("Can't find any urls")
|
||||
}
|
||||
|
||||
if i != Z.ArticleCount-1 {
|
||||
t.Errorf("Can't find the exact ArticleCount urls %d vs %d", i, Z.ArticleCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainPage(t *testing.T) {
|
||||
a, _ := Z.MainPage()
|
||||
if a == nil {
|
||||
t.Errorf("Can't find the mainpage article")
|
||||
}
|
||||
|
||||
t.Log(a)
|
||||
}
|
||||
|
||||
func TestFavicon(t *testing.T) {
|
||||
favicon, err := Z.Favicon()
|
||||
if err != nil {
|
||||
t.Errorf("%+v", errors.WithStack(err))
|
||||
}
|
||||
if favicon == nil {
|
||||
t.Errorf("Can't find the favicon article")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetadata(t *testing.T) {
|
||||
metadata, err := Z.Metadata()
|
||||
if err != nil {
|
||||
t.Errorf("%+v", errors.WithStack(err))
|
||||
}
|
||||
if metadata == nil {
|
||||
t.Errorf("Can't find the metadata")
|
||||
}
|
||||
}
|
||||
|
||||
func TestData(t *testing.T) {
|
||||
// addr 0 is a redirect
|
||||
p, _ := Z.OffsetAtURLIdx(2)
|
||||
a, _ := Z.ArticleAt(p)
|
||||
b, _ := a.Data()
|
||||
data := string(b)
|
||||
if a.EntryType != RedirectEntry {
|
||||
if len(data) == 0 {
|
||||
t.Error("can't read data")
|
||||
}
|
||||
}
|
||||
t.Log(a.String())
|
||||
t.Log(data)
|
||||
}
|
||||
|
||||
func BenchmarkArticleBytes(b *testing.B) {
|
||||
// addr 0 is a redirect
|
||||
p, _ := Z.OffsetAtURLIdx(5)
|
||||
a, _ := Z.ArticleAt(p)
|
||||
if a == nil {
|
||||
b.Errorf("Can't find 1st url")
|
||||
}
|
||||
data, err := a.Data()
|
||||
if err != nil {
|
||||
b.Error(err)
|
||||
}
|
||||
|
||||
b.SetBytes(int64(len(data)))
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
a.Data()
|
||||
bcache.Purge() // prevent memiozing value
|
||||
}
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
package zim
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"github.com/klauspost/compress/zstd"
|
||||
)
|
||||
|
||||
type ZstdReader struct {
|
||||
*zstd.Decoder
|
||||
}
|
||||
|
||||
func NewZstdReader(r io.Reader) (*ZstdReader, error) {
|
||||
dec, err := zstd.NewReader(r)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("can't read from zstd %w", err)
|
||||
}
|
||||
return &ZstdReader{dec}, nil
|
||||
}
|
||||
|
||||
func (zr *ZstdReader) Close() error {
|
||||
zr.Decoder.Close()
|
||||
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,447 @@
|
|||
package bundle
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
|
||||
"forge.cadoles.com/arcad/edge/pkg/bundle/zim"
|
||||
"github.com/pkg/errors"
|
||||
"gitlab.com/wpetit/goweb/logger"
|
||||
"gopkg.in/yaml.v2"
|
||||
)
|
||||
|
||||
// ZimBundle serves the content of a ZIM archive through its File and Dir
// methods.
type ZimBundle struct {
	// archivePath is the path given to zim.NewReader whenever the archive is
	// opened.
	archivePath string
}
|
||||
|
||||
func (b *ZimBundle) File(filename string) (io.ReadCloser, os.FileInfo, error) {
|
||||
ctx := logger.With(
|
||||
context.Background(),
|
||||
logger.F("filename", filename),
|
||||
)
|
||||
|
||||
logger.Debug(ctx, "opening file")
|
||||
|
||||
switch filename {
|
||||
case "manifest.yml":
|
||||
return b.renderFakeManifest(ctx)
|
||||
case "server/main.js":
|
||||
return b.renderFakeServerMain(ctx)
|
||||
case "public":
|
||||
return b.renderDirectory(ctx, filename)
|
||||
case "public/index.html":
|
||||
return b.redirectToMainPage(ctx, filename)
|
||||
|
||||
default:
|
||||
return b.renderURL(ctx, filename)
|
||||
}
|
||||
}
|
||||
|
||||
func (b *ZimBundle) Dir(dirname string) ([]os.FileInfo, error) {
|
||||
reader, err := b.openArchive()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err := reader.Close(); err != nil {
|
||||
panic(errors.WithStack(err))
|
||||
}
|
||||
}()
|
||||
|
||||
files := make([]os.FileInfo, 0)
|
||||
// ctx := context.Background()
|
||||
|
||||
// for _, f := range reader.File {
|
||||
// if !strings.HasPrefix(f.Name, dirname) {
|
||||
// continue
|
||||
// }
|
||||
|
||||
// relPath, err := filepath.Rel(dirname, f.Name)
|
||||
// if err != nil {
|
||||
// return nil, errors.Wrap(err, "could not get relative path")
|
||||
// }
|
||||
|
||||
// logger.Debug(
|
||||
// ctx, "checking file prefix",
|
||||
// logger.F("dirname", dirname),
|
||||
// logger.F("filename", f.Name),
|
||||
// logger.F("relpath", relPath),
|
||||
// )
|
||||
|
||||
// if relPath == filepath.Base(f.Name) {
|
||||
// files = append(files, f.FileInfo())
|
||||
// }
|
||||
// }
|
||||
|
||||
return files, nil
|
||||
}
|
||||
|
||||
func (b *ZimBundle) renderFakeManifest(ctx context.Context) (io.ReadCloser, os.FileInfo, error) {
|
||||
reader, err := b.openArchive()
|
||||
if err != nil {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err := reader.Close(); err != nil {
|
||||
panic(errors.WithStack(err))
|
||||
}
|
||||
}()
|
||||
|
||||
metadata, err := reader.Metadata()
|
||||
if err != nil {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
manifest := map[string]any{}
|
||||
|
||||
manifest["version"] = "0.0.0"
|
||||
|
||||
if name, exists := metadata[zim.MetadataName]; exists {
|
||||
replacer := strings.NewReplacer(
|
||||
"_", "",
|
||||
" ", "",
|
||||
)
|
||||
|
||||
manifest["id"] = strings.ToLower(replacer.Replace(name)) + ".zim.edge.app"
|
||||
} else {
|
||||
manifest["id"] = strconv.FormatUint(uint64(reader.UUID), 10) + ".zim.edge.app"
|
||||
}
|
||||
|
||||
if title, exists := metadata[zim.MetadataTitle]; exists {
|
||||
manifest["title"] = title
|
||||
} else {
|
||||
manifest["title"] = "Unknown"
|
||||
}
|
||||
|
||||
if description, exists := metadata[zim.MetadataDescription]; exists {
|
||||
manifest["description"] = description
|
||||
}
|
||||
|
||||
favicon, err := reader.Favicon()
|
||||
if err != nil && !errors.Is(err, zim.ErrNotFound) {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
if favicon != nil {
|
||||
manifestMeta, exists := manifest["metadata"].(map[string]any)
|
||||
if !exists {
|
||||
manifestMeta = make(map[string]any)
|
||||
manifest["metadata"] = manifestMeta
|
||||
}
|
||||
|
||||
paths, exists := manifestMeta["paths"].(map[string]any)
|
||||
if !exists {
|
||||
paths = make(map[string]any)
|
||||
manifestMeta["paths"] = paths
|
||||
}
|
||||
|
||||
paths["icon"] = "/" + favicon.FullURL()
|
||||
}
|
||||
|
||||
data, err := yaml.Marshal(manifest)
|
||||
if err != nil {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
stat := &zimFileInfo{
|
||||
isDir: false,
|
||||
modTime: time.Time{},
|
||||
mode: 0,
|
||||
name: "manifest.yml",
|
||||
size: int64(len(data)),
|
||||
}
|
||||
|
||||
buf := bytes.NewBuffer(data)
|
||||
file := ioutil.NopCloser(buf)
|
||||
|
||||
return file, stat, nil
|
||||
}
|
||||
|
||||
func (b *ZimBundle) renderFakeServerMain(ctx context.Context) (io.ReadCloser, os.FileInfo, error) {
|
||||
stat := &zimFileInfo{
|
||||
isDir: false,
|
||||
modTime: time.Time{},
|
||||
mode: 0,
|
||||
name: "server/main.js",
|
||||
size: 0,
|
||||
}
|
||||
|
||||
buf := bytes.NewBuffer(nil)
|
||||
file := ioutil.NopCloser(buf)
|
||||
|
||||
return file, stat, nil
|
||||
}
|
||||
|
||||
func (b *ZimBundle) renderURL(ctx context.Context, url string) (io.ReadCloser, os.FileInfo, error) {
|
||||
zr, err := b.openArchive()
|
||||
if err != nil {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err := zr.Close(); err != nil {
|
||||
panic(errors.WithStack(err))
|
||||
}
|
||||
}()
|
||||
|
||||
filename := filepath.Base(url)
|
||||
url = strings.TrimPrefix(url, "public/")
|
||||
|
||||
article, err := zr.GetPageNoIndex(url)
|
||||
if err != nil {
|
||||
if errors.Is(err, zim.ErrNotFound) {
|
||||
return nil, nil, errors.WithStack(fs.ErrNotExist)
|
||||
}
|
||||
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
if article.EntryType == zim.RedirectEntry {
|
||||
redirectIndex, err := article.RedirectIndex()
|
||||
if err != nil {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
ra, err := zr.ArticleAtURLIdx(redirectIndex)
|
||||
if err != nil {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
return b.renderRedirect(ctx, filename, ra.FullURL())
|
||||
}
|
||||
|
||||
data, err := article.Data()
|
||||
if err != nil {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
mimeType := article.MimeType()
|
||||
if mimeType == "text/html" {
|
||||
injected, err := b.injectEdgeScriptTag(data)
|
||||
if err != nil {
|
||||
logger.Error(ctx, "could not inject edge script", logger.E(errors.WithStack(err)))
|
||||
} else {
|
||||
data = injected
|
||||
}
|
||||
}
|
||||
|
||||
zimFile := &zimFile{
|
||||
fileInfo: &zimFileInfo{
|
||||
isDir: false,
|
||||
modTime: time.Time{},
|
||||
mode: 0,
|
||||
name: filename,
|
||||
size: int64(len(data)),
|
||||
},
|
||||
buff: bytes.NewBuffer(data),
|
||||
}
|
||||
|
||||
return zimFile, zimFile.fileInfo, nil
|
||||
}
|
||||
|
||||
func (b *ZimBundle) renderDirectory(ctx context.Context, filename string) (io.ReadCloser, os.FileInfo, error) {
|
||||
zr, err := b.openArchive()
|
||||
if err != nil {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err := zr.Close(); err != nil {
|
||||
panic(errors.WithStack(err))
|
||||
}
|
||||
}()
|
||||
|
||||
zimFile := &zimFile{
|
||||
fileInfo: &zimFileInfo{
|
||||
isDir: true,
|
||||
modTime: time.Time{},
|
||||
mode: 0,
|
||||
name: filename,
|
||||
size: 0,
|
||||
},
|
||||
buff: bytes.NewBuffer(nil),
|
||||
}
|
||||
|
||||
return zimFile, zimFile.fileInfo, nil
|
||||
}
|
||||
|
||||
func (b *ZimBundle) renderRedirect(ctx context.Context, filename string, to string) (io.ReadCloser, os.FileInfo, error) {
|
||||
logger.Debug(ctx, "rendering redirect", logger.F("url", to))
|
||||
|
||||
data := fmt.Sprintf(`
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="refresh" content="0; url=/%s" />
|
||||
</head>
|
||||
</html>
|
||||
`, to)
|
||||
|
||||
stat := &zimFileInfo{
|
||||
isDir: false,
|
||||
modTime: time.Time{},
|
||||
mode: 0,
|
||||
name: filename,
|
||||
size: int64(len(data)),
|
||||
}
|
||||
|
||||
buf := bytes.NewBuffer([]byte(data))
|
||||
reader := ioutil.NopCloser(buf)
|
||||
|
||||
return reader, stat, nil
|
||||
}
|
||||
|
||||
func (b *ZimBundle) redirectToMainPage(ctx context.Context, filename string) (io.ReadCloser, os.FileInfo, error) {
|
||||
zr, err := b.openArchive()
|
||||
if err != nil {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err := zr.Close(); err != nil {
|
||||
panic(errors.WithStack(err))
|
||||
}
|
||||
}()
|
||||
|
||||
main, err := zr.MainPage()
|
||||
if err != nil {
|
||||
return nil, nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
return b.renderRedirect(ctx, filename, main.FullURL())
|
||||
}
|
||||
|
||||
func (b *ZimBundle) injectEdgeScriptTag(data []byte) ([]byte, error) {
|
||||
buff := bytes.NewBuffer(data)
|
||||
doc, err := html.Parse(buff)
|
||||
if err != nil {
|
||||
return nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
var f func(*html.Node) bool
|
||||
f = func(n *html.Node) bool {
|
||||
if n.Type == html.ElementNode && n.Data == "head" {
|
||||
script := &html.Node{
|
||||
Type: html.ElementNode,
|
||||
Data: "script",
|
||||
Attr: []html.Attribute{
|
||||
{
|
||||
Key: "src",
|
||||
Val: "/edge/sdk/client.js",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
n.AppendChild(script)
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
if keepWalking := f(c); !keepWalking {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
f(doc)
|
||||
|
||||
buff.Reset()
|
||||
|
||||
if err := html.Render(buff, doc); err != nil {
|
||||
return nil, errors.WithStack(err)
|
||||
}
|
||||
|
||||
return buff.Bytes(), nil
|
||||
}
|
||||
|
||||
func (b *ZimBundle) openArchive() (*zim.ZimReader, error) {
|
||||
zm, err := zim.NewReader(b.archivePath)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "could not open '%v'", b.archivePath)
|
||||
}
|
||||
|
||||
return zm, nil
|
||||
}
|
||||
|
||||
func NewZimBundle(archivePath string) *ZimBundle {
|
||||
return &ZimBundle{
|
||||
archivePath: archivePath,
|
||||
}
|
||||
}
|
||||
|
||||
type zimFile struct {
|
||||
fileInfo *zimFileInfo
|
||||
buff *bytes.Buffer
|
||||
}
|
||||
|
||||
// Close implements fs.File.
|
||||
func (f *zimFile) Close() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Read implements fs.File.
|
||||
func (f *zimFile) Read(d []byte) (int, error) {
|
||||
return f.buff.Read(d)
|
||||
}
|
||||
|
||||
// Stat implements fs.File.
|
||||
func (f *zimFile) Stat() (fs.FileInfo, error) {
|
||||
return f.fileInfo, nil
|
||||
}
|
||||
|
||||
var _ fs.File = &zimFile{}
|
||||
|
||||
// zimFileInfo is a static fs.FileInfo describing an in-memory bundle entry.
type zimFileInfo struct {
	isDir   bool        // whether the entry is a directory
	modTime time.Time   // value returned by ModTime
	mode    fs.FileMode // mode bits, without the directory flag
	name    string      // value returned by Name
	size    int64       // content length in bytes
}

// IsDir implements fs.FileInfo.
func (i *zimFileInfo) IsDir() bool {
	return i.isDir
}

// ModTime implements fs.FileInfo.
func (i *zimFileInfo) ModTime() time.Time {
	return i.modTime
}

// Mode implements fs.FileInfo.
//
// fs.FileInfo documents IsDir as an abbreviation for Mode().IsDir(), so the
// fs.ModeDir bit is added for directory entries to keep both answers in
// agreement.
func (i *zimFileInfo) Mode() fs.FileMode {
	if i.isDir {
		return i.mode | fs.ModeDir
	}

	return i.mode
}

// Name implements fs.FileInfo.
func (i *zimFileInfo) Name() string {
	return i.name
}

// Size implements fs.FileInfo.
func (i *zimFileInfo) Size() int64 {
	return i.size
}

// Sys implements fs.FileInfo. There is no underlying data source, so it
// always returns nil.
func (*zimFileInfo) Sys() any {
	return nil
}

// Compile-time check that *zimFileInfo satisfies fs.FileInfo.
var _ fs.FileInfo = &zimFileInfo{}
|
Loading…
Reference in New Issue