319 lines
6.9 KiB
Go
319 lines
6.9 KiB
Go
package zim
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"strings"
|
|
"sync"
|
|
|
|
lru "github.com/hashicorp/golang-lru/v2"
|
|
"github.com/pkg/errors"
|
|
)
|
|
|
|
// zimHeader is the magic number (72173914, i.e. 0x044D495A) that
// readFileHeaders expects to decode from the first four bytes of a file
// before treating it as a ZIM archive.
const zimHeader = 72173914
|
|
|
|
// ZimReader holds the open file and the header values needed to read a ZIM
// archive. All header fields are populated by readFileHeaders.
type ZimReader struct {
	// f is the underlying file every read goes through.
	f *os.File

	// UUID is the 32-bit value decoded from the header bytes at offset 8.
	UUID uint32
	// ArticleCount is the number of entries, decoded from header offset 24.
	ArticleCount uint32
	// clusterCount is the number of clusters, decoded from header offset 28.
	clusterCount uint32

	// File offsets of the URL pointer list, title pointer list, cluster
	// pointer list and MIME type list (header offsets 32, 40, 48 and 56).
	urlPtrPos     uint64
	titlePtrPos   uint64
	clusterPtrPos uint64
	mimeListPos   uint64

	// mainPage and layoutPage are decoded from header offsets 64 and 68;
	// NewReader presets both to 0xffffffff before the header is parsed.
	mainPage   uint32
	layoutPage uint32

	// mimeTypeList caches the result of MimeTypes.
	mimeTypeList []string
}
|
|
|
|
// create a new zim reader
|
|
func NewReader(path string) (*ZimReader, error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
z := ZimReader{f: f, mainPage: 0xffffffff, layoutPage: 0xffffffff}
|
|
|
|
articlePool = sync.Pool{
|
|
New: func() interface{} {
|
|
return new(Article)
|
|
},
|
|
}
|
|
// keep 4 latest uncompressed blobs, around 1M per blob
|
|
bcache, _ = lru.New[any, any](5)
|
|
|
|
err = z.readFileHeaders()
|
|
return &z, err
|
|
}
|
|
|
|
// Return an ordered list of mime types present in the ZIM file
|
|
func (z *ZimReader) MimeTypes() []string {
|
|
if len(z.mimeTypeList) != 0 {
|
|
return z.mimeTypeList
|
|
}
|
|
|
|
var s []string
|
|
// assume mime list fit in 2k
|
|
b, err := z.bytesRangeAt(z.mimeListPos, z.mimeListPos+2048)
|
|
if err != nil {
|
|
return s
|
|
}
|
|
bbuf := bytes.NewBuffer(b)
|
|
|
|
for {
|
|
line, err := bbuf.ReadBytes('\x00')
|
|
if err != nil && err != io.EOF {
|
|
return s
|
|
}
|
|
// a line of 1 is a line containing only \x00 and it's the marker for the
|
|
// end of mime types list
|
|
if len(line) == 1 {
|
|
break
|
|
}
|
|
s = append(s, strings.TrimRight(string(line), "\x00"))
|
|
}
|
|
z.mimeTypeList = s
|
|
return s
|
|
}
|
|
|
|
// list all articles, using url index, contained in a zim file
|
|
// note that this is a slow implementation, a real iterator is faster
|
|
// you are not suppose to use this method on big zim files, use indexes
|
|
func (z *ZimReader) ListArticles() <-chan *Article {
|
|
ch := make(chan *Article, 10)
|
|
|
|
go func() {
|
|
var idx uint32
|
|
// starting at 1 to avoid "con" entry
|
|
var start uint32 = 1
|
|
|
|
for idx = start; idx < z.ArticleCount; idx++ {
|
|
art, err := z.ArticleAtURLIdx(idx)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
if art == nil {
|
|
// TODO: deal with redirect continue
|
|
continue
|
|
}
|
|
ch <- art
|
|
}
|
|
close(ch)
|
|
}()
|
|
return ch
|
|
}
|
|
|
|
// list all title pointer, Titles by position contained in a zim file
|
|
// Titles are pointers to URLpos index, useful for indexing cause smaller to store: uint32
|
|
// note that this is a slow implementation, a real iterator is faster
|
|
// you are not suppose to use this method on big zim files prefer ListTitlesPtrIterator to build your index
|
|
func (z *ZimReader) ListTitlesPtr() <-chan uint32 {
|
|
ch := make(chan uint32, 10)
|
|
|
|
go func() {
|
|
var pos uint64
|
|
var count uint32
|
|
|
|
for pos = z.titlePtrPos; count < z.ArticleCount; pos += 4 {
|
|
idx, err := readInt32(z.bytesRangeAt(pos, pos+4))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
ch <- idx
|
|
count++
|
|
}
|
|
close(ch)
|
|
}()
|
|
return ch
|
|
}
|
|
|
|
// list all title pointer, Titles by position contained in a zim file
|
|
// Titles are pointers to URLpos index, usefull for indexing cause smaller to store: uint32
|
|
func (z *ZimReader) ListTitlesPtrIterator(cb func(uint32)) {
|
|
var count uint32
|
|
for pos := z.titlePtrPos; count < z.ArticleCount; pos += 4 {
|
|
idx, err := readInt32(z.bytesRangeAt(pos, pos+4))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
cb(idx)
|
|
count++
|
|
}
|
|
}
|
|
|
|
// return the article at the exact url not using any index
|
|
func (z *ZimReader) GetPageNoIndex(url string) (*Article, error) {
|
|
// starting at 1 to avoid "con" entry
|
|
var start uint32
|
|
stop := z.ArticleCount
|
|
|
|
a := new(Article)
|
|
|
|
for {
|
|
pos := (start + stop) / 2
|
|
|
|
offset, err := z.OffsetAtURLIdx(pos)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
err = z.FillArticleAt(a, offset)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if a.FullURL() == url {
|
|
return a, nil
|
|
}
|
|
|
|
if a.FullURL() > url {
|
|
stop = pos
|
|
} else {
|
|
start = pos
|
|
}
|
|
if stop-start == 1 {
|
|
break
|
|
}
|
|
|
|
}
|
|
return nil, errors.WithStack(ErrNotFound)
|
|
}
|
|
|
|
// get the offset pointing to Article at pos in the URL idx
|
|
func (z *ZimReader) OffsetAtURLIdx(idx uint32) (uint64, error) {
|
|
offset := z.urlPtrPos + uint64(idx)*8
|
|
return readInt64(z.bytesRangeAt(offset, offset+8))
|
|
}
|
|
|
|
// Close & cleanup the zimreader
|
|
func (z *ZimReader) Close() error {
|
|
return z.f.Close()
|
|
}
|
|
|
|
func (z *ZimReader) String() string {
|
|
fi, err := z.f.Stat()
|
|
if err != nil {
|
|
return "corrupted zim"
|
|
}
|
|
return fmt.Sprintf("Size: %d, ArticleCount: %d urlPtrPos: 0x%x titlePtrPos: 0x%x mimeListPos: 0x%x clusterPtrPos: 0x%x\nMimeTypes: %v",
|
|
fi.Size(), z.ArticleCount, z.urlPtrPos, z.titlePtrPos, z.mimeListPos, z.clusterPtrPos, z.MimeTypes())
|
|
}
|
|
|
|
// getBytesRangeAt returns bytes from start to end
|
|
// it's needed to abstract mmap usages rather than read directly on the mmap slices
|
|
func (z *ZimReader) bytesRangeAt(start, end uint64) ([]byte, error) {
|
|
buf := make([]byte, end-start)
|
|
n, err := z.f.ReadAt(buf, int64(start))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("can't read bytes %w", err)
|
|
}
|
|
|
|
if n != int(end-start) {
|
|
return nil, errors.New("can't read enough bytes")
|
|
}
|
|
|
|
return buf, nil
|
|
}
|
|
|
|
// populate the ZimReader structs with headers
|
|
func (z *ZimReader) readFileHeaders() error {
|
|
// checking for file type
|
|
v, err := readInt32(z.bytesRangeAt(0, 0+4))
|
|
if err != nil || v != zimHeader {
|
|
return errors.New("not a ZIM file")
|
|
}
|
|
|
|
// checking for version
|
|
v, err = readInt32(z.bytesRangeAt(4, 4+4))
|
|
if err != nil {
|
|
return errors.Wrap(err, "could not read file version")
|
|
}
|
|
|
|
// checking for articles count
|
|
v, err = readInt32(z.bytesRangeAt(8, 16))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
z.UUID = v
|
|
|
|
// checking for articles count
|
|
v, err = readInt32(z.bytesRangeAt(24, 24+4))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
z.ArticleCount = v
|
|
|
|
// checking for cluster count
|
|
v, err = readInt32(z.bytesRangeAt(28, 28+4))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
z.clusterCount = v
|
|
|
|
// checking for urlPtrPos
|
|
vb, err := readInt64(z.bytesRangeAt(32, 32+8))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
z.urlPtrPos = vb
|
|
|
|
// checking for titlePtrPos
|
|
vb, err = readInt64(z.bytesRangeAt(40, 40+8))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
z.titlePtrPos = vb
|
|
|
|
// checking for clusterPtrPos
|
|
vb, err = readInt64(z.bytesRangeAt(48, 48+8))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
z.clusterPtrPos = vb
|
|
|
|
// checking for mimeListPos
|
|
vb, err = readInt64(z.bytesRangeAt(56, 56+8))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
z.mimeListPos = vb
|
|
|
|
// checking for mainPage
|
|
v, err = readInt32(z.bytesRangeAt(64, 64+4))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
z.mainPage = v
|
|
|
|
// checking for layoutPage
|
|
v, err = readInt32(z.bytesRangeAt(68, 68+4))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
z.layoutPage = v
|
|
|
|
z.MimeTypes()
|
|
return nil
|
|
}
|
|
|
|
// return start and end offsets for cluster at index idx
|
|
func (z *ZimReader) clusterOffsetsAtIdx(idx uint32) (start, end uint64, err error) {
|
|
offset := z.clusterPtrPos + (uint64(idx) * 8)
|
|
start, err = readInt64(z.bytesRangeAt(offset, offset+8))
|
|
if err != nil {
|
|
return
|
|
}
|
|
offset = z.clusterPtrPos + (uint64(idx+1) * 8)
|
|
end, err = readInt64(z.bytesRangeAt(offset, offset+8))
|
|
if err != nil {
|
|
return
|
|
}
|
|
end--
|
|
return
|
|
}
|