package zim
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"io/ioutil"
|
||
|
"strings"
|
||
|
"sync"
|
||
|
|
||
|
lru "github.com/hashicorp/golang-lru/v2"
|
||
|
)
|
||
|
|
||
|
const (
	// RedirectEntry marks a directory entry whose mimetype field flags a
	// redirect to another entry instead of an index into mimeTypeList.
	RedirectEntry uint16 = 0xffff
	// LinkTargetEntry marks a link-target entry (carries no content).
	LinkTargetEntry = 0xfffe
	// DeletedEntry marks an entry deleted from the archive.
	DeletedEntry = 0xfffd
)
|
||
|
|
||
|
// articlePool recycles Article values to reduce allocations when walking
// many directory entries.
// NOTE(review): no New func is set here — presumably installed elsewhere,
// otherwise Get can return a nil interface; confirm before relying on it.
var articlePool sync.Pool

// the recent uncompressed blobs, mainly useful while indexing and asking
// for the same blob again and again
// NOTE(review): bcache is only declared here — it must be initialized
// elsewhere before Article.Data is called, or Get/Add will fail on a nil
// cache; verify the init site.
var bcache *lru.Cache[any, any]
|
||
|
|
||
|
// Article represents one directory entry of a ZIM archive: either a
// content entry, a redirect, a link target, or a deleted entry.
type Article struct {
	// EntryType is a RedirectEntry/LinkTargetEntry/DeletedEntry or an idx
	// pointing to ZimReader.mimeTypeList
	EntryType uint16
	// Title is the human-readable title of the entry.
	Title string
	// URLPtr is the absolute file offset this entry was read from.
	URLPtr uint64
	// Namespace is the single-byte ZIM namespace (e.g. 'A' for articles).
	Namespace byte
	// url is the entry's URL without the namespace prefix.
	url string
	// blob is the blob number inside the cluster holding the content.
	blob uint32
	// cluster is the cluster number — for RedirectEntry it instead holds
	// the redirect's URL-list index (see RedirectIndex).
	cluster uint32
	// z is the reader this article was loaded from.
	z *ZimReader
}
|
||
|
|
||
|
// convenient method to return the Article at URL index idx
|
||
|
func (z *ZimReader) ArticleAtURLIdx(idx uint32) (*Article, error) {
|
||
|
o, err := z.OffsetAtURLIdx(idx)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
return z.ArticleAt(o)
|
||
|
}
|
||
|
|
||
|
// return the article main page if it exists
|
||
|
func (z *ZimReader) MainPage() (*Article, error) {
|
||
|
if z.mainPage == 0xffffffff {
|
||
|
return nil, nil
|
||
|
}
|
||
|
return z.ArticleAtURLIdx(z.mainPage)
|
||
|
}
|
||
|
|
||
|
// get the article (Directory) pointed by the offset found in URLpos or Titlepos
|
||
|
func (z *ZimReader) ArticleAt(offset uint64) (*Article, error) {
|
||
|
a := articlePool.Get().(*Article)
|
||
|
err := z.FillArticleAt(a, offset)
|
||
|
return a, err
|
||
|
}
|
||
|
|
||
|
// Fill an article with datas found at offset
|
||
|
func (z *ZimReader) FillArticleAt(a *Article, offset uint64) error {
|
||
|
a.z = z
|
||
|
a.URLPtr = offset
|
||
|
|
||
|
mimeIdx, err := readInt16(z.bytesRangeAt(offset, offset+2))
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("can't read article %w", err)
|
||
|
}
|
||
|
a.EntryType = mimeIdx
|
||
|
|
||
|
// Linktarget or Target Entry
|
||
|
if mimeIdx == LinkTargetEntry || mimeIdx == DeletedEntry {
|
||
|
// TODO
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
s, err := z.bytesRangeAt(offset+3, offset+4)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
a.Namespace = s[0]
|
||
|
|
||
|
a.cluster, err = readInt32(z.bytesRangeAt(offset+8, offset+8+4))
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
a.blob, err = readInt32(z.bytesRangeAt(offset+12, offset+12+4))
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
// Redirect
|
||
|
if mimeIdx == RedirectEntry {
|
||
|
// assume the url + title won't be longer than 2k
|
||
|
b, err := z.bytesRangeAt(offset+12, offset+12+2048)
|
||
|
if err != nil {
|
||
|
return nil
|
||
|
}
|
||
|
bbuf := bytes.NewBuffer(b)
|
||
|
a.url, err = bbuf.ReadString('\x00')
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
a.url = strings.TrimRight(a.url, "\x00")
|
||
|
|
||
|
a.Title, err = bbuf.ReadString('\x00')
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
a.Title = strings.TrimRight(a.Title, "\x00")
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
b, err := z.bytesRangeAt(offset+16, offset+16+2048)
|
||
|
if err != nil {
|
||
|
return nil
|
||
|
}
|
||
|
bbuf := bytes.NewBuffer(b)
|
||
|
a.url, err = bbuf.ReadString('\x00')
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
a.url = strings.TrimRight(string(a.url), "\x00")
|
||
|
|
||
|
title, err := bbuf.ReadString('\x00')
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
title = strings.TrimRight(string(title), "\x00")
|
||
|
// This is a trick to force a copy and avoid retain of the full buffer
|
||
|
// mainly for indexing title reasons
|
||
|
if len(title) != 0 {
|
||
|
a.Title = title[0:1] + title[1:]
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// return the uncompressed data associated with this article
|
||
|
func (a *Article) Data() ([]byte, error) {
|
||
|
// ensure we have data to read
|
||
|
if a.EntryType == RedirectEntry || a.EntryType == LinkTargetEntry || a.EntryType == DeletedEntry {
|
||
|
return nil, nil
|
||
|
}
|
||
|
start, end, err := a.z.clusterOffsetsAtIdx(a.cluster)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
s, err := a.z.bytesRangeAt(start, start+1)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
compression := uint8(s[0])
|
||
|
|
||
|
// blob starts at offset, blob ends at offset
|
||
|
var bs, be uint32
|
||
|
|
||
|
// LZMA: 4, Zstandard: 5
|
||
|
if compression == 4 || compression == 5 {
|
||
|
blobLookup := func() ([]byte, bool) {
|
||
|
if v, ok := bcache.Get(a.cluster); ok {
|
||
|
b := v.([]byte)
|
||
|
return b, ok
|
||
|
}
|
||
|
return nil, false
|
||
|
}
|
||
|
|
||
|
var blob []byte
|
||
|
var ok bool
|
||
|
var dec io.ReadCloser
|
||
|
if blob, ok = blobLookup(); !ok {
|
||
|
b, err := a.z.bytesRangeAt(start+1, end+1)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
bbuf := bytes.NewBuffer(b)
|
||
|
switch compression {
|
||
|
case 5:
|
||
|
dec, err = NewZstdReader(bbuf)
|
||
|
|
||
|
case 4:
|
||
|
dec, err = NewXZReader(bbuf)
|
||
|
}
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
defer dec.Close()
|
||
|
// the decoded chunk are around 1MB
|
||
|
b, err = ioutil.ReadAll(dec)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
blob = make([]byte, len(b))
|
||
|
copy(blob, b)
|
||
|
// TODO: 2 requests for the same blob could occure at the same time
|
||
|
bcache.Add(a.cluster, blob)
|
||
|
} else {
|
||
|
bi, ok := bcache.Get(a.cluster)
|
||
|
if !ok {
|
||
|
return nil, errors.New("not in cache anymore")
|
||
|
}
|
||
|
blob = bi.([]byte)
|
||
|
}
|
||
|
|
||
|
bs, err = readInt32(blob[a.blob*4:a.blob*4+4], nil)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
be, err = readInt32(blob[a.blob*4+4:a.blob*4+4+4], nil)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// avoid retaining all the chunk
|
||
|
c := make([]byte, be-bs)
|
||
|
copy(c, blob[bs:be])
|
||
|
return c, nil
|
||
|
|
||
|
} else if compression == 0 || compression == 1 {
|
||
|
// uncompresssed
|
||
|
startPos := start + 1
|
||
|
blobOffset := uint64(a.blob * 4)
|
||
|
|
||
|
bs, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset, startPos+blobOffset+4))
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
be, err := readInt32(a.z.bytesRangeAt(startPos+blobOffset+4, startPos+blobOffset+4+4))
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
return a.z.bytesRangeAt(startPos+uint64(bs), startPos+uint64(be))
|
||
|
}
|
||
|
|
||
|
return nil, errors.New("Unhandled compression")
|
||
|
}
|
||
|
|
||
|
func (a *Article) MimeType() string {
|
||
|
if a.EntryType == RedirectEntry || a.EntryType == LinkTargetEntry || a.EntryType == DeletedEntry {
|
||
|
return ""
|
||
|
}
|
||
|
|
||
|
return a.z.mimeTypeList[a.EntryType]
|
||
|
}
|
||
|
|
||
|
// return the url prefixed by the namespace
|
||
|
func (a *Article) FullURL() string {
|
||
|
return string(a.Namespace) + "/" + a.url
|
||
|
}
|
||
|
|
||
|
func (a *Article) String() string {
|
||
|
return fmt.Sprintf("Mime: 0x%x URL: [%s], Title: [%s], Cluster: 0x%x Blob: 0x%x",
|
||
|
a.EntryType, a.FullURL(), a.Title, a.cluster, a.blob)
|
||
|
}
|
||
|
|
||
|
// RedirectIndex return the redirect index of RedirectEntry type article
|
||
|
// return an err if not a redirect entry
|
||
|
func (a *Article) RedirectIndex() (uint32, error) {
|
||
|
if a.EntryType != RedirectEntry {
|
||
|
return 0, errors.New("Not a RedirectEntry")
|
||
|
}
|
||
|
// We use the cluster to save the redirect index position for RedirectEntry type
|
||
|
return a.cluster, nil
|
||
|
}
|
||
|
|
||
|
func (a *Article) blobOffsetsAtIdx(z *ZimReader) (start, end uint64) {
|
||
|
idx := a.blob
|
||
|
offset := z.clusterPtrPos + uint64(idx)*8
|
||
|
start, err := readInt64(z.bytesRangeAt(offset, offset+8))
|
||
|
if err != nil {
|
||
|
return
|
||
|
}
|
||
|
offset = z.clusterPtrPos + uint64(idx+1)*8
|
||
|
end, _ = readInt64(z.bytesRangeAt(offset, offset+8))
|
||
|
|
||
|
return
|
||
|
}
|