diff --git a/go.mod b/go.mod index 8bd6f31..95b025a 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module forge.cadoles.com/arcad/edge -go 1.20 +go 1.21 require ( github.com/hashicorp/golang-lru/v2 v2.0.7 diff --git a/pkg/bundle/zim/article.go b/pkg/bundle/oldzim/article.go similarity index 100% rename from pkg/bundle/zim/article.go rename to pkg/bundle/oldzim/article.go diff --git a/pkg/bundle/oldzim/error.go b/pkg/bundle/oldzim/error.go new file mode 100644 index 0000000..92bbbe2 --- /dev/null +++ b/pkg/bundle/oldzim/error.go @@ -0,0 +1,7 @@ +package zim + +import "errors" + +var ( + ErrNotFound = errors.New("not found") +) diff --git a/pkg/bundle/zim/favicon.go b/pkg/bundle/oldzim/favicon.go similarity index 100% rename from pkg/bundle/zim/favicon.go rename to pkg/bundle/oldzim/favicon.go diff --git a/pkg/bundle/zim/metadata.go b/pkg/bundle/oldzim/metadata.go similarity index 100% rename from pkg/bundle/zim/metadata.go rename to pkg/bundle/oldzim/metadata.go diff --git a/pkg/bundle/zim/tools.go b/pkg/bundle/oldzim/tools.go similarity index 100% rename from pkg/bundle/zim/tools.go rename to pkg/bundle/oldzim/tools.go diff --git a/pkg/bundle/zim/xz_reader.go b/pkg/bundle/oldzim/xz_reader.go similarity index 100% rename from pkg/bundle/zim/xz_reader.go rename to pkg/bundle/oldzim/xz_reader.go diff --git a/pkg/bundle/zim/zim.go b/pkg/bundle/oldzim/zim.go similarity index 99% rename from pkg/bundle/zim/zim.go rename to pkg/bundle/oldzim/zim.go index 8d51510..f4bae69 100644 --- a/pkg/bundle/zim/zim.go +++ b/pkg/bundle/oldzim/zim.go @@ -8,6 +8,7 @@ import ( "strings" "sync" + "github.com/davecgh/go-spew/spew" lru "github.com/hashicorp/golang-lru/v2" "github.com/pkg/errors" ) @@ -100,6 +101,7 @@ func (z *ZimReader) ListArticles() <-chan *Article { if art == nil { // TODO: deal with redirect continue + continue } ch <- art } @@ -296,6 +298,8 @@ func (z *ZimReader) readFileHeaders() error { } z.layoutPage = v + spew.Dump(z) + z.MimeTypes() return nil } diff --git 
a/pkg/bundle/oldzim/zim_test.go b/pkg/bundle/oldzim/zim_test.go new file mode 100644 index 0000000..6e7724d --- /dev/null +++ b/pkg/bundle/oldzim/zim_test.go @@ -0,0 +1,153 @@ +package zim + +import ( + "path/filepath" + "reflect" + "runtime" + "testing" + + "github.com/pkg/errors" +) + +var testCases = []func(t *testing.T, z *ZimReader){ + testOpen, + testData, + testDisplayArticle, + testDisplayInfost, + testFavicon, + testListArticles, + testMainPage, + testMetadata, + testMime, + testURLAtIdx, +} + +func TestZim(t *testing.T) { + zimFiles, err := filepath.Glob("testdata/*.zim") + if err != nil { + t.Fatalf("%+v", errors.WithStack(err)) + } + + for _, zf := range zimFiles { + zr, err := NewReader(zf) + if err != nil { + t.Fatalf("%+v", errors.WithStack(err)) + } + + base := filepath.Base(zf) + + t.Run(base, func(t *testing.T) { + for _, fn := range testCases { + testName := runtime.FuncForPC(reflect.ValueOf(fn).Pointer()).Name() + t.Run(testName, func(t *testing.T) { + fn(t, zr) + }) + } + }) + } +} + +func testOpen(t *testing.T, zr *ZimReader) { + if zr.ArticleCount == 0 { + t.Errorf("No article found") + } +} + +func testMime(t *testing.T, zr *ZimReader) { + if len(zr.MimeTypes()) == 0 { + t.Errorf("No mime types found") + } +} + +func testDisplayInfost(t *testing.T, zr *ZimReader) { + info := zr.String() + if len(info) < 0 { + t.Errorf("Can't read infos") + } + t.Log(info) +} + +func testURLAtIdx(t *testing.T, zr *ZimReader) { + // addr 0 is a redirect + p, _ := zr.OffsetAtURLIdx(5) + a, _ := zr.ArticleAt(p) + if a == nil { + t.Errorf("Can't find 1st url") + } +} + +func testDisplayArticle(t *testing.T, zr *ZimReader) { + // addr 0 is a redirect + p, _ := zr.OffsetAtURLIdx(5) + a, _ := zr.ArticleAt(p) + if a == nil { + t.Errorf("Can't find 1st url") + } + + t.Log(a) +} + +func testListArticles(t *testing.T, zr *ZimReader) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + var i uint32 + + for a := range zr.ListArticles() { + i++ + 
t.Log(a.String()) + } + + if i == 0 { + t.Errorf("Can't find any urls") + } + + if i != zr.ArticleCount-1 { + t.Errorf("Can't find the exact ArticleCount urls %d vs %d", i, zr.ArticleCount) + } +} + +func testMainPage(t *testing.T, zr *ZimReader) { + a, _ := zr.MainPage() + if a == nil { + t.Errorf("Can't find the mainpage article") + } + + t.Log(a) +} + +func testFavicon(t *testing.T, zr *ZimReader) { + favicon, err := zr.Favicon() + if err != nil { + t.Errorf("%+v", errors.WithStack(err)) + } + if favicon == nil { + t.Errorf("Can't find the favicon article") + } +} + +func testMetadata(t *testing.T, zr *ZimReader) { + metadata, err := zr.Metadata() + if err != nil { + t.Errorf("%+v", errors.WithStack(err)) + } + if metadata == nil { + t.Errorf("Can't find the metadata") + } +} + +func testData(t *testing.T, zr *ZimReader) { + // addr 0 is a redirect + p, _ := zr.OffsetAtURLIdx(2) + a, _ := zr.ArticleAt(p) + b, _ := a.Data() + data := string(b) + if a.EntryType != RedirectEntry { + if len(data) == 0 { + t.Error("can't read data") + } + } + t.Log(a.String()) + t.Log(data) +} diff --git a/pkg/bundle/zim/zstd_reader.go b/pkg/bundle/oldzim/zstd_reader.go similarity index 100% rename from pkg/bundle/zim/zstd_reader.go rename to pkg/bundle/oldzim/zstd_reader.go diff --git a/pkg/bundle/zim/content_entry.go b/pkg/bundle/zim/content_entry.go new file mode 100644 index 0000000..bae88d1 --- /dev/null +++ b/pkg/bundle/zim/content_entry.go @@ -0,0 +1,233 @@ +package zim + +import ( + "bytes" + "encoding/binary" + "io" + "log" + + "github.com/pkg/errors" +) + +type zimCompression uint8 + +const ( + zimCompressionNoneZeno zimCompression = 0 + zimCompressionNone zimCompression = 1 + zimCompressionNoneZLib zimCompression = 2 + zimCompressionNoneBZip2 zimCompression = 3 + zimCompressionNoneXZ zimCompression = 4 + zimCompressionNoneZStandard zimCompression = 5 +) + +type ContentEntry struct { + *BaseEntry + mimeType string + clusterIndex uint32 + blobIndex uint32 +} + +func (e 
*ContentEntry) Reader() (io.Reader, error) { + data := make([]byte, 8) + + startClusterPtrOffset := e.reader.clusterPtrPos + (uint64(e.clusterIndex) * 8) + if err := e.reader.readRange(int64(startClusterPtrOffset), data); err != nil { + return nil, errors.WithStack(err) + } + + startClusterOffset, err := readUint64(data, binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + endClusterPtrOffset := e.reader.clusterPtrPos + (uint64(e.clusterIndex+1) * 8) + if err := e.reader.readRange(int64(endClusterPtrOffset), data); err != nil { + return nil, errors.WithStack(err) + } + + endClusterOffset, err := readUint64(data, binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + data = make([]byte, 1) + if err := e.reader.readRange(int64(startClusterOffset), data); err != nil { + return nil, errors.WithStack(err) + } + + clusterHeader := uint8(data[0]) + + compression := (clusterHeader << 4) >> 4 + extended := (clusterHeader<<3)>>7 == 1 + + log.Printf("%08b %v %04b %d %d %d", clusterHeader, extended, compression, compression, startClusterOffset, endClusterOffset) + + switch compression { + case uint8(zimCompressionNoneZeno): + fallthrough + case uint8(zimCompressionNone): + + case uint8(zimCompressionNoneXZ): + + case uint8(zimCompressionNoneZStandard): + + case uint8(zimCompressionNoneZLib): + fallthrough + case uint8(zimCompressionNoneBZip2): + fallthrough + default: + // return nil, errors.Wrapf(ErrCompressionAlgorithmNotSupported, "unexpected compression algorithm '%d'", compression) + + } + + var internal []byte + buff := bytes.NewBuffer(internal) + + // blob starts at offset, blob ends at offset + // var bs, be uint32 + + // // LZMA: 4, Zstandard: 5 + // if compression == 4 || compression == 5 { + // var blob []byte + // var ok bool + // var dec io.ReadCloser + // if blob, ok = blobLookup(); !ok { + // b, err := a.z.bytesRangeAt(start+1, end+1) + // if err != nil { + // return nil, err + // } + // bbuf := 
bytes.NewBuffer(b) + // switch compression { + // case 5: + // dec, err = NewZstdReader(bbuf) + + // case 4: + // dec, err = NewXZReader(bbuf) + // } + // if err != nil { + // return nil, err + // } + // defer dec.Close() + // // the decoded chunk are around 1MB + // b, err = ioutil.ReadAll(dec) + // if err != nil { + // return nil, err + // } + // blob = make([]byte, len(b)) + // copy(blob, b) + // // TODO: 2 requests for the same blob could occure at the same time + // bcache.Add(a.cluster, blob) + // } else { + // bi, ok := bcache.Get(a.cluster) + // if !ok { + // return nil, errors.New("not in cache anymore") + // } + // blob = bi.([]byte) + // } + + // bs, err = readInt32(blob[a.blob*4:a.blob*4+4], nil) + // if err != nil { + // return nil, err + // } + // be, err = readInt32(blob[a.blob*4+4:a.blob*4+4+4], nil) + // if err != nil { + // return nil, err + // } + + // // avoid retaining all the chunk + // c := make([]byte, be-bs) + // copy(c, blob[bs:be]) + // return c, nil + + // } else if compression == 0 || compression == 1 { + // // uncompresssed + // startPos := start + 1 + // blobOffset := uint64(a.blob * 4) + + // bs, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset, startPos+blobOffset+4)) + // if err != nil { + // return nil, err + // } + + // be, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset+4, startPos+blobOffset+4+4)) + // if err != nil { + // return nil, err + // } + + // return a.z.bytesRangeAt(startPos+uint64(bs), startPos+uint64(be)) + // } + + return buff, nil +} + +func (e *ContentEntry) Redirect() (*ContentEntry, error) { + return e, nil +} + +func (r *Reader) parseContentEntry(offset int64, base *BaseEntry) (*ContentEntry, error) { + entry := &ContentEntry{ + BaseEntry: base, + } + + data := make([]byte, 2) + if err := r.readRange(offset, data); err != nil { + return nil, errors.WithStack(err) + } + + mimeTypeIndex, err := readUint16(data, binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + if 
mimeTypeIndex >= uint16(len(r.mimeTypes)) { + return nil, errors.Errorf("mime type index '%d' greater than mime types length '%d'", mimeTypeIndex, len(r.mimeTypes)) + } + + entry.mimeType = r.mimeTypes[mimeTypeIndex] + + data = make([]byte, 1) + if err := r.readRange(offset+3, data); err != nil { + return nil, errors.WithStack(err) + } + + entry.namespace = Namespace(data[0]) + + data = make([]byte, 4) + if err := r.readRange(offset+8, data); err != nil { + return nil, errors.WithStack(err) + } + + clusterIndex, err := readUint32(data, binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.clusterIndex = clusterIndex + + if err := r.readRange(offset+12, data); err != nil { + return nil, errors.WithStack(err) + } + + blobIndex, err := readUint32(data, binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.blobIndex = blobIndex + + url, read, err := r.readStringAt(offset + 16) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.url = url + + title, _, err := r.readStringAt(offset + 16 + read + 1) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.title = title + + return entry, nil +} diff --git a/pkg/bundle/zim/entry.go b/pkg/bundle/zim/entry.go new file mode 100644 index 0000000..a9abbff --- /dev/null +++ b/pkg/bundle/zim/entry.go @@ -0,0 +1,150 @@ +package zim + +import ( + "encoding/binary" + + "github.com/pkg/errors" +) + +type Namespace string + +const ( + V6NamespaceContent = "C" + V6NamespaceMetadata = "M" + V6NamespaceWellKnown = "W" + V6NamespaceSearch = "X" +) + +const ( + V5NamespaceLayout = "-" + V5NamespaceArticle = "A" + V5NamespaceArticleMetadata = "B" + V5NamespaceImageFile = "I" + V5NamespaceImageText = "J" + V5NamespaceMetadata = "M" + V5NamespaceCategoryText = "U" + V5NamespaceCategoryArticleList = "V" + V5NamespaceCategoryPerArticle = "W" + V5NamespaceSearch = "X" +) + +type Entry interface { + Redirect() (*ContentEntry, error) + Namespace() Namespace 
+ URL() string + Title() string +} + +type BaseEntry struct { + mimeTypeIndex uint16 + namespace Namespace + url string + title string + reader *Reader +} + +func (e *BaseEntry) Namespace() Namespace { + return e.namespace +} + +func (e *BaseEntry) Title() string { + if e.title == "" { + return e.url + } + + return e.title +} + +func (e *BaseEntry) URL() string { + return e.url +} + +func (r *Reader) parseBaseEntry(offset int64) (*BaseEntry, error) { + entry := &BaseEntry{ + reader: r, + } + + data := make([]byte, 2) + if err := r.readRange(offset, data); err != nil { + return nil, errors.WithStack(err) + } + + mimeTypeIndex, err := readUint16(data, binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.mimeTypeIndex = mimeTypeIndex + + data = make([]byte, 1) + if err := r.readRange(offset+3, data); err != nil { + return nil, errors.WithStack(err) + } + + entry.namespace = Namespace(data[0]) + + return entry, nil +} + +type RedirectEntry struct { + *BaseEntry + redirectIndex uint32 +} + +func (e *RedirectEntry) Redirect() (*ContentEntry, error) { + if e.redirectIndex >= uint32(len(e.reader.urlIndex)) { + return nil, errors.Wrapf(ErrInvalidEntryIndex, "entry index '%d' out of bounds", e.redirectIndex) + } + + entryPtr := e.reader.urlIndex[e.redirectIndex] + entry, err := e.reader.parseEntryAt(int64(entryPtr)) + if err != nil { + return nil, errors.WithStack(err) + } + + entry, err = entry.Redirect() + if err != nil { + return nil, errors.WithStack(err) + } + + contentEntry, ok := entry.(*ContentEntry) + if !ok { + return nil, errors.WithStack(ErrInvalidRedirect) + } + + return contentEntry, nil +} + +func (r *Reader) parseRedirectEntry(offset int64, base *BaseEntry) (*RedirectEntry, error) { + entry := &RedirectEntry{ + BaseEntry: base, + } + + data := make([]byte, 4) + if err := r.readRange(offset+8, data); err != nil { + return nil, errors.WithStack(err) + } + + redirectIndex, err := readUint32(data, binary.LittleEndian) + if err 
!= nil { + return nil, errors.WithStack(err) + } + + entry.redirectIndex = redirectIndex + + url, read, err := r.readStringAt(offset + 12) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.url = url + + title, _, err := r.readStringAt(offset + 12 + read + 1) + if err != nil { + return nil, errors.WithStack(err) + } + + entry.title = title + + return entry, nil +} diff --git a/pkg/bundle/zim/entry_iterator.go b/pkg/bundle/zim/entry_iterator.go new file mode 100644 index 0000000..68e93b1 --- /dev/null +++ b/pkg/bundle/zim/entry_iterator.go @@ -0,0 +1,46 @@ +package zim + +import "github.com/pkg/errors" + +type EntryIterator struct { + index int + entry Entry + err error + reader *Reader +} + +func (it *EntryIterator) Next() bool { + if it.err != nil { + return false + } + + entryCount := it.reader.EntryCount() + + if it.index >= int(entryCount) { + return false + } + + entry, err := it.reader.EntryAt(it.index) + if err != nil { + it.err = errors.WithStack(err) + + return false + } + + it.entry = entry + it.index++ + + return true +} + +func (it *EntryIterator) Err() error { + return it.err +} + +func (it *EntryIterator) Index() int { + return it.index +} + +func (it *EntryIterator) Entry() Entry { + return it.entry +} diff --git a/pkg/bundle/zim/error.go b/pkg/bundle/zim/error.go index 236a681..19b1766 100644 --- a/pkg/bundle/zim/error.go +++ b/pkg/bundle/zim/error.go @@ -2,4 +2,9 @@ package zim import "errors" -var ErrNotFound = errors.New("not found") +var ( + ErrInvalidEntryIndex = errors.New("invalid entry index") + ErrNotFound = errors.New("not found") + ErrInvalidRedirect = errors.New("invalid redirect") + ErrCompressionAlgorithmNotSupported = errors.New("compression algorithm not supported") +) diff --git a/pkg/bundle/zim/option.go b/pkg/bundle/zim/option.go new file mode 100644 index 0000000..2fd07d9 --- /dev/null +++ b/pkg/bundle/zim/option.go @@ -0,0 +1,38 @@ +package zim + +import "time" + +type Options struct { + URLCacheSize int + 
URLCacheTTL time.Duration + TitleCacheSize int + TitleCacheTTL time.Duration +} + +type OptionFunc func(opts *Options) + +func NewOptions(funcs ...OptionFunc) *Options { + funcs = append([]OptionFunc{ + WithURLCacheSize(64), + WithTitleCacheSize(64), + }, funcs...) + + opts := &Options{} + for _, fn := range funcs { + fn(opts) + } + + return opts +} + +func WithURLCacheSize(size int) OptionFunc { + return func(opts *Options) { + opts.URLCacheSize = size + } +} + +func WithTitleCacheSize(size int) OptionFunc { + return func(opts *Options) { + opts.TitleCacheSize = size + } +} diff --git a/pkg/bundle/zim/reader.go b/pkg/bundle/zim/reader.go new file mode 100644 index 0000000..86b87b6 --- /dev/null +++ b/pkg/bundle/zim/reader.go @@ -0,0 +1,522 @@ +package zim + +import ( + "encoding/binary" + "fmt" + "io" + "os" + "strings" + + lru "github.com/hashicorp/golang-lru/v2" + "github.com/pkg/errors" +) + +const zimFormatMagicNumber uint32 = 0x44D495A +const nullByte = '\x00' +const zimRedirect = 0xffff + +type Reader struct { + majorVersion uint16 + minorVersion uint16 + uuid string + entryCount uint32 + clusterCount uint32 + urlPtrPos uint64 + titlePtrPos uint64 + clusterPtrPos uint64 + mimeListPos uint64 + mainPage uint32 + layoutPage uint32 + checksumPos uint64 + + mimeTypes []string + + urlIndex []uint64 + + urlCache *lru.Cache[string, uint64] + titleCache *lru.Cache[string, uint64] + + seeker io.ReadSeekCloser +} + +func (r *Reader) Version() (majorVersion, minorVersion uint16) { + return r.majorVersion, r.minorVersion +} + +func (r *Reader) EntryCount() uint32 { + return r.entryCount +} + +func (r *Reader) ClusterCount() uint32 { + return r.clusterCount +} + +func (r *Reader) UUID() string { + return r.uuid +} + +func (r *Reader) Entries() *EntryIterator { + return &EntryIterator{ + reader: r, + } +} + +func (r *Reader) EntryAt(idx int) (Entry, error) { + if idx >= len(r.urlIndex) || idx < 0 { + return nil, errors.Wrapf(ErrInvalidEntryIndex, "index '%d' out of 
bounds", idx) + } + + entryPtr := r.urlIndex[idx] + + entry, err := r.parseEntryAt(int64(entryPtr)) + if err != nil { + return nil, errors.WithStack(err) + } + + r.cacheEntry(entryPtr, entry) + + return entry, nil +} + +func (r *Reader) EntryWithURL(ns Namespace, url string) (Entry, error) { + offset, found := r.getEntryOffsetByURLFromCache(ns, url) + if found { + entry, err := r.parseEntryAt(int64(offset)) + if err != nil { + return nil, errors.WithStack(err) + } + + return entry, nil + } + + iterator := r.Entries() + + for iterator.Next() { + entry := iterator.Entry() + + if entry.Namespace() == ns && entry.URL() == url { + return entry, nil + } + } + if err := iterator.Err(); err != nil { + return nil, errors.WithStack(err) + } + + return nil, errors.WithStack(ErrNotFound) +} + +func (r *Reader) EntryWithTitle(title string) (Entry, error) { + offset, found := r.getEntryOffsetByTitleFromCache(title) + if found { + entry, err := r.parseEntryAt(int64(offset)) + if err != nil { + return nil, errors.WithStack(err) + } + + return entry, nil + } + + iterator := r.Entries() + + for iterator.Next() { + entry := iterator.Entry() + + if entry.Title() == title { + return entry, nil + } + } + if err := iterator.Err(); err != nil { + return nil, errors.WithStack(err) + } + + return nil, errors.WithStack(ErrNotFound) +} + +func (r *Reader) getURLCacheKey(entry Entry) string { + return fmt.Sprintf("%s/%s", entry.Namespace(), entry.URL()) +} + +func (r *Reader) cacheEntry(offset uint64, entry Entry) { + urlKey := r.getURLCacheKey(entry) + r.urlCache.Add(urlKey, offset) + r.titleCache.Add(entry.Title(), offset) +} + +func (r *Reader) getEntryOffsetByURLFromCache(namespace Namespace, url string) (uint64, bool) { + key := fmt.Sprintf("%s/%s", namespace, url) + return r.urlCache.Get(key) +} + +func (r *Reader) getEntryOffsetByTitleFromCache(title string) (uint64, bool) { + return r.titleCache.Get(title) +} + +func (r *Reader) parse() error { + if err := r.parseHeader(); err != nil { 
+ return errors.WithStack(err) + } + + if err := r.parseMimeTypes(); err != nil { + return errors.WithStack(err) + } + + if err := r.parseURLIndex(); err != nil { + return errors.WithStack(err) + } + + return nil +} + +func (r *Reader) parseHeader() error { + magicNumber, err := r.readUint32At(0) + if err != nil { + return errors.WithStack(err) + } + + if magicNumber != zimFormatMagicNumber { + return errors.Errorf("invalid zim magic number '%d'", magicNumber) + } + + majorVersion, err := r.readUint16At(4) + if err != nil { + return errors.WithStack(err) + } + + r.majorVersion = majorVersion + + minorVersion, err := r.readUint16At(6) + if err != nil { + return errors.WithStack(err) + } + + r.minorVersion = minorVersion + + if err := r.parseUUID(); err != nil { + return errors.WithStack(err) + } + + entryCount, err := r.readUint32At(24) + if err != nil { + return errors.WithStack(err) + } + + r.entryCount = entryCount + + clusterCount, err := r.readUint32At(28) + if err != nil { + return errors.WithStack(err) + } + + r.clusterCount = clusterCount + + urlPtrPos, err := r.readUint64At(32) + if err != nil { + return errors.WithStack(err) + } + + r.urlPtrPos = urlPtrPos + + titlePtrPos, err := r.readUint64At(40) + if err != nil { + return errors.WithStack(err) + } + + r.titlePtrPos = titlePtrPos + + clusterPtrPos, err := r.readUint64At(48) + if err != nil { + return errors.WithStack(err) + } + + r.clusterPtrPos = clusterPtrPos + + mimeListPos, err := r.readUint64At(56) + if err != nil { + return errors.WithStack(err) + } + + r.mimeListPos = mimeListPos + + mainPage, err := r.readUint32At(64) + if err != nil { + return errors.WithStack(err) + } + + r.mainPage = mainPage + + layoutPage, err := r.readUint32At(68) + if err != nil { + return errors.WithStack(err) + } + + r.layoutPage = layoutPage + + checksumPos, err := r.readUint64At(72) + if err != nil { + return errors.WithStack(err) + } + + r.checksumPos = checksumPos + + return nil +} + +func (r *Reader) parseUUID() 
error { + data := make([]byte, 16) + if err := r.readRange(8, data); err != nil { + return errors.WithStack(err) + } + + parts := make([]string, 0, 5) + + val32, err := readUint32(data[0:4], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + parts = append(parts, fmt.Sprintf("%08x", val32)) + + val16, err := readUint16(data[4:6], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + parts = append(parts, fmt.Sprintf("%04x", val16)) + + val16, err = readUint16(data[6:8], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + parts = append(parts, fmt.Sprintf("%04x", val16)) + + val16, err = readUint16(data[8:10], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + parts = append(parts, fmt.Sprintf("%04x", val16)) + + val32, err = readUint32(data[10:14], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + val16, err = readUint16(data[14:16], binary.BigEndian) + if err != nil { + return errors.WithStack(err) + } + + parts = append(parts, fmt.Sprintf("%08x%04x", val32, val16)) + + r.uuid = strings.Join(parts, "-") + + return nil +} + +func (r *Reader) parseMimeTypes() error { + mimeTypes := make([]string, 0) + + offset := int64(r.mimeListPos) + for { + mimeType, read, err := r.readStringAt(offset) + if err != nil { + return errors.WithStack(err) + } + + if mimeType == "" { + break + } + + mimeTypes = append(mimeTypes, mimeType) + + offset += read + 1 + } + + r.mimeTypes = mimeTypes + + return nil +} + +func (r *Reader) parseURLIndex() error { + urlIndex, err := r.parseEntryIndex(int64(r.urlPtrPos)) + if err != nil { + return errors.WithStack(err) + } + + r.urlIndex = urlIndex + + return nil +} + +func (r *Reader) parseEntryAt(offset int64) (Entry, error) { + base, err := r.parseBaseEntry(offset) + if err != nil { + return nil, errors.WithStack(err) + } + + var entry Entry + + if base.mimeTypeIndex == zimRedirect { + entry, err = r.parseRedirectEntry(offset, base) 
+ if err != nil { + return nil, errors.WithStack(err) + } + } else { + entry, err = r.parseContentEntry(offset, base) + if err != nil { + return nil, errors.WithStack(err) + } + } + + return entry, nil +} + +func (r *Reader) parseEntryIndex(startAddr int64) ([]uint64, error) { + index := make([]uint64, r.entryCount) + + data := make([]byte, 8) + for i := int64(0); i < int64(r.entryCount); i++ { + if err := r.readRange(startAddr+i*8, data); err != nil { + return nil, errors.WithStack(err) + } + + ptr, err := readUint64(data, binary.LittleEndian) + if err != nil { + return nil, errors.WithStack(err) + } + + index[i] = ptr + } + + return index, nil +} + +func (r *Reader) readRange(offset int64, v []byte) error { + if _, err := r.seeker.Seek(offset, io.SeekStart); err != nil { + return errors.WithStack(err) + } + + read, err := io.ReadFull(r.seeker, v) + if err != nil { + return errors.WithStack(err) + } + + if read != len(v) { + return errors.New("could not read enough bytes") + } + + return nil +} + +func (r *Reader) readUint32At(offset int64) (uint32, error) { + data := make([]byte, 4) + if err := r.readRange(offset, data); err != nil { + return 0, errors.WithStack(err) + } + + value, err := readUint32(data, binary.LittleEndian) + if err != nil { + return 0, errors.WithStack(err) + } + + return value, nil +} + +func (r *Reader) readUint16At(offset int64) (uint16, error) { + data := make([]byte, 2) + if err := r.readRange(offset, data); err != nil { + return 0, errors.WithStack(err) + } + + value, err := readUint16(data, binary.LittleEndian) + if err != nil { + return 0, errors.WithStack(err) + } + + return value, nil +} + +func (r *Reader) readUint64At(offset int64) (uint64, error) { + data := make([]byte, 8) + if err := r.readRange(offset, data); err != nil { + return 0, errors.WithStack(err) + } + + value, err := readUint64(data, binary.LittleEndian) + if err != nil { + return 0, errors.WithStack(err) + } + + return value, nil +} + +func (r *Reader) readStringAt(offset 
int64) (string, int64, error) { + data := make([]byte, 1) + var sb strings.Builder + read := int64(0) + for { + if err := r.readRange(offset+read, data); err != nil { + return "", read, errors.WithStack(err) + } + + if err := sb.WriteByte(data[0]); err != nil { + return "", read, errors.WithStack(err) + } + + if data[0] == nullByte { + str := strings.TrimRight(sb.String(), "\x00") + return str, read, nil + } + + read++ + } +} + +func (r *Reader) Close() error { + if err := r.seeker.Close(); err != nil { + return errors.WithStack(err) + } + + return nil +} + +func NewReader(seeker io.ReadSeekCloser, funcs ...OptionFunc) (*Reader, error) { + opts := NewOptions(funcs...) + + urlCache, err := lru.New[string, uint64](opts.URLCacheSize) + if err != nil { + return nil, errors.WithStack(err) + } + + titleCache, err := lru.New[string, uint64](opts.TitleCacheSize) + if err != nil { + return nil, errors.WithStack(err) + } + + reader := &Reader{ + seeker: seeker, + urlCache: urlCache, + titleCache: titleCache, + } + + if err := reader.parse(); err != nil { + return nil, errors.WithStack(err) + } + + return reader, nil +} + +func Open(path string, funcs ...OptionFunc) (*Reader, error) { + file, err := os.OpenFile(path, os.O_RDONLY, os.ModePerm) + if err != nil { + return nil, errors.WithStack(err) + } + + reader, err := NewReader(file, funcs...) 
+ if err != nil { + return nil, errors.WithStack(err) + } + + return reader, nil +} diff --git a/pkg/bundle/zim/reader_test.go b/pkg/bundle/zim/reader_test.go new file mode 100644 index 0000000..a53e4bf --- /dev/null +++ b/pkg/bundle/zim/reader_test.go @@ -0,0 +1,83 @@ +package zim + +import ( + "log" + "path/filepath" + "testing" + + "github.com/davecgh/go-spew/spew" + "github.com/pkg/errors" +) + +func TestReader(t *testing.T) { + files, err := filepath.Glob("testdata/*.zim") + if err != nil { + t.Fatalf("%+v", errors.WithStack(err)) + } + + for _, zf := range files { + testName := filepath.Base(zf) + t.Run(testName, func(t *testing.T) { + reader, err := Open(zf) + if err != nil { + t.Fatalf("%+v", errors.WithStack(err)) + } + + defer func() { + if err := reader.Close(); err != nil { + t.Fatalf("%+v", errors.WithStack(err)) + } + }() + + iterator := reader.Entries() + for iterator.Next() { + entry := iterator.Entry() + + content, err := entry.Redirect() + if err != nil { + t.Errorf("%+v", errors.WithStack(err)) + break + } + + log.Printf("%s/%s: %s", content.Namespace(), content.URL(), content.Title()) + + contentReader, err := content.Reader() + if err != nil { + t.Errorf("%+v", errors.WithStack(err)) + break + } + + spew.Dump(contentReader) + } + if err := iterator.Err(); err != nil { + if err != nil { + t.Errorf("%+v", errors.WithStack(err)) + } + } + }) + + // entry, err := reader.EntryWithURL(V6NamespaceContent, "A/a.tile.openstreetmap.org/16/33682/22970.png") + // if err != nil { + // t.Fatalf("%+v", errors.WithStack(err)) + // } + + // content, err := entry.Redirect() + // if err != nil { + // t.Fatalf("%+v", errors.WithStack(err)) + // } + + // contentReader, err := content.Reader() + // if err != nil { + // t.Fatalf("%+v", errors.WithStack(err)) + // break + // } + + // data, err := io.ReadAll(contentReader) + // if err != nil { + // t.Fatalf("%+v", errors.WithStack(err)) + // break + // } + + // spew.Dump(data) + } +} diff --git 
a/pkg/bundle/zim/testdata/cadoles.zim b/pkg/bundle/zim/testdata/cadoles.zim new file mode 100644 index 0000000..a218b84 Binary files /dev/null and b/pkg/bundle/zim/testdata/cadoles.zim differ diff --git a/pkg/bundle/zim/util.go b/pkg/bundle/zim/util.go new file mode 100644 index 0000000..fd53347 --- /dev/null +++ b/pkg/bundle/zim/util.go @@ -0,0 +1,52 @@ +package zim + +import ( + "bytes" + "encoding/binary" + + "github.com/pkg/errors" +) + +// read a little endian uint64 +func readUint64(b []byte, order binary.ByteOrder) (uint64, error) { + var v uint64 + buf := bytes.NewBuffer(b) + if err := binary.Read(buf, order, &v); err != nil { + return 0, errors.WithStack(err) + } + + return v, nil +} + +// read a little endian uint32 +func readUint32(b []byte, order binary.ByteOrder) (uint32, error) { + var v uint32 + buf := bytes.NewBuffer(b) + if err := binary.Read(buf, order, &v); err != nil { + return 0, errors.WithStack(err) + } + + return v, nil +} + +// read a little endian uint16 +func readUint16(b []byte, order binary.ByteOrder) (uint16, error) { + var v uint16 + buf := bytes.NewBuffer(b) + if err := binary.Read(buf, order, &v); err != nil { + return 0, errors.WithStack(err) + } + + return v, nil +} + +// read a little endian uint8 +func readUint8(b []byte, order binary.ByteOrder) (uint8, error) { + var v uint8 + buf := bytes.NewBuffer(b) + if err := binary.Read(buf, order, &v); err != nil { + return 0, errors.WithStack(err) + } + + return v, nil +} diff --git a/pkg/bundle/zim/zim_test.go b/pkg/bundle/zim/zim_test.go deleted file mode 100644 index 3c85463..0000000 --- a/pkg/bundle/zim/zim_test.go +++ /dev/null @@ -1,150 +0,0 @@ -package zim - -import ( - "log" - "testing" - - "github.com/pkg/errors" -) - -var Z *ZimReader - -func init() { - var err error - Z, err = NewReader("testdata/wikibooks_af_all_maxi_2023-06.zim") - if err != nil { - log.Panicf("Can't read %v", err) - } -} - -func TestOpen(t *testing.T) { - if Z.ArticleCount == 0 { - t.Errorf("No article 
found") - } -} - -func TestMime(t *testing.T) { - if len(Z.MimeTypes()) == 0 { - t.Errorf("No mime types found") - } -} - -func TestDisplayInfost(t *testing.T) { - info := Z.String() - if len(info) < 0 { - t.Errorf("Can't read infos") - } - t.Log(info) -} - -func TestURLAtIdx(t *testing.T) { - // addr 0 is a redirect - p, _ := Z.OffsetAtURLIdx(5) - a, _ := Z.ArticleAt(p) - if a == nil { - t.Errorf("Can't find 1st url") - } -} - -func TestDisplayArticle(t *testing.T) { - // addr 0 is a redirect - p, _ := Z.OffsetAtURLIdx(5) - a, _ := Z.ArticleAt(p) - if a == nil { - t.Errorf("Can't find 1st url") - } - - t.Log(a) -} - -func TestPageNoIndex(t *testing.T) { - a, _ := Z.GetPageNoIndex("A/Dracula:Capitol_1.html") - if a == nil { - t.Errorf("Can't find existing url") - } -} - -func TestListArticles(t *testing.T) { - if testing.Short() { - t.Skip("skipping test in short mode.") - } - - var i uint32 - - for a := range Z.ListArticles() { - i++ - t.Log(a.String()) - } - - if i == 0 { - t.Errorf("Can't find any urls") - } - - if i != Z.ArticleCount-1 { - t.Errorf("Can't find the exact ArticleCount urls %d vs %d", i, Z.ArticleCount) - } -} - -func TestMainPage(t *testing.T) { - a, _ := Z.MainPage() - if a == nil { - t.Errorf("Can't find the mainpage article") - } - - t.Log(a) -} - -func TestFavicon(t *testing.T) { - favicon, err := Z.Favicon() - if err != nil { - t.Errorf("%+v", errors.WithStack(err)) - } - if favicon == nil { - t.Errorf("Can't find the favicon article") - } -} - -func TestMetadata(t *testing.T) { - metadata, err := Z.Metadata() - if err != nil { - t.Errorf("%+v", errors.WithStack(err)) - } - if metadata == nil { - t.Errorf("Can't find the metadata") - } -} - -func TestData(t *testing.T) { - // addr 0 is a redirect - p, _ := Z.OffsetAtURLIdx(2) - a, _ := Z.ArticleAt(p) - b, _ := a.Data() - data := string(b) - if a.EntryType != RedirectEntry { - if len(data) == 0 { - t.Error("can't read data") - } - } - t.Log(a.String()) - t.Log(data) -} - -func 
BenchmarkArticleBytes(b *testing.B) { - // addr 0 is a redirect - p, _ := Z.OffsetAtURLIdx(5) - a, _ := Z.ArticleAt(p) - if a == nil { - b.Errorf("Can't find 1st url") - } - data, err := a.Data() - if err != nil { - b.Error(err) - } - - b.SetBytes(int64(len(data))) - b.ResetTimer() - for i := 0; i < b.N; i++ { - a.Data() - bcache.Purge() // prevent memiozing value - } -}