// Source: edge/pkg/bundle/oldzim/zim.go (Go, 319 lines, 6.9 KiB)

package zim
import (
"bytes"
"fmt"
"io"
"os"
"strings"
"sync"
lru "github.com/hashicorp/golang-lru/v2"
"github.com/pkg/errors"
)
const (
// zimHeader is the magic number expected in the first four bytes of a ZIM
// file; readFileHeaders rejects any file whose leading 32-bit value differs.
// In hexadecimal the value is 0x044D495A.
zimHeader = 72173914
)
// ZimReader keeps track of everything related to reading a ZIM file: the open
// file handle plus the header values loaded by readFileHeaders.
type ZimReader struct {
// f is the open ZIM file; every read goes through it.
f *os.File
// UUID is the 32-bit value decoded from the header bytes starting at offset 8.
UUID uint32
// ArticleCount is read from header offset 24 and bounds the walks over the
// URL and title pointer lists.
ArticleCount uint32
// clusterCount is read from header offset 28.
clusterCount uint32
// urlPtrPos is the file offset of the URL pointer list (8-byte entries),
// read from header offset 32.
urlPtrPos uint64
// titlePtrPos is the file offset of the title pointer list (4-byte entries),
// read from header offset 40.
titlePtrPos uint64
// clusterPtrPos is the file offset of the cluster pointer list (8-byte
// entries), read from header offset 48.
clusterPtrPos uint64
// mimeListPos is the file offset at which MimeTypes starts reading, read
// from header offset 56.
mimeListPos uint64
// mainPage is read from header offset 64; NewReader presets it to 0xffffffff.
mainPage uint32
// layoutPage is read from header offset 68; NewReader presets it to 0xffffffff.
layoutPage uint32
// mimeTypeList caches the result of MimeTypes once it has been parsed.
mimeTypeList []string
}
// NewReader opens the ZIM file at path, loads its header and returns a reader
// for it.
//
// On any failure the file handle is closed and a nil reader is returned with
// the error, so a failed call does not leak a file descriptor. On success the
// caller owns the reader and must call Close when done.
//
// NOTE(review): articlePool and bcache are package-level variables declared
// outside this function and are re-initialised on every call, so opening a
// second reader resets state shared with the first — confirm this is intended.
func NewReader(path string) (*ZimReader, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	// 0xffffffff is the preset for mainPage/layoutPage until the header is read.
	z := &ZimReader{f: f, mainPage: 0xffffffff, layoutPage: 0xffffffff}

	articlePool = sync.Pool{
		New: func() any {
			return new(Article)
		},
	}

	// keep the 5 latest uncompressed blobs, around 1M per blob
	bcache, err = lru.New[any, any](5)
	if err != nil {
		f.Close()
		return nil, fmt.Errorf("creating blob cache: %w", err)
	}

	if err = z.readFileHeaders(); err != nil {
		// The reader is unusable without a valid header; release the file.
		f.Close()
		return nil, err
	}
	return z, nil
}
// MimeTypes returns the ordered list of MIME types present in the ZIM file.
//
// The list is stored at mimeListPos as NUL-terminated strings and ends with an
// empty string. The parsed result is cached in z.mimeTypeList, so later calls
// return it without touching the file. A nil slice is returned when the list
// cannot be read.
//
// Only the first 2 KiB of the list are inspected. If the terminating empty
// entry is not found inside that window, parsing stops at the end of the
// window and any trailing partial entry is dropped.
func (z *ZimReader) MimeTypes() []string {
	if len(z.mimeTypeList) != 0 {
		return z.mimeTypeList
	}

	// assume the mime list fits in 2k
	end := z.mimeListPos + 2048
	// Clamp the window to the file size so that small files, which end before
	// the 2 KiB window does, can still be parsed.
	if fi, err := z.f.Stat(); err == nil {
		if size := uint64(fi.Size()); size > z.mimeListPos && size < end {
			end = size
		}
	}

	b, err := z.bytesRangeAt(z.mimeListPos, end)
	if err != nil {
		return nil
	}

	var types []string
	buf := bytes.NewBuffer(b)
	for {
		line, err := buf.ReadBytes('\x00')
		if err == io.EOF {
			// Window exhausted without a terminator. Stop here: the buffer
			// would keep returning empty reads forever otherwise.
			break
		}
		if err != nil {
			return types
		}
		// a line of 1 is a line containing only \x00 and it's the marker for
		// the end of the mime types list
		if len(line) == 1 {
			break
		}
		types = append(types, strings.TrimRight(string(line), "\x00"))
	}

	z.mimeTypeList = types
	return types
}
// ListArticles streams every article reachable through the URL index on the
// returned channel, which is closed once the index has been walked.
//
// This is a slow implementation meant for small files; a real iterator is
// faster, and big ZIM files should be accessed through indexes instead.
// Entries that fail to load, or that come back nil, are skipped silently.
// The producing goroutine blocks on the channel, so the caller is expected to
// drain it.
func (z *ZimReader) ListArticles() <-chan *Article {
	out := make(chan *Article, 10)

	go func() {
		defer close(out)
		// Index 0 is skipped on purpose: starting at 1 avoids the "con" entry.
		for i := uint32(1); i < z.ArticleCount; i++ {
			article, err := z.ArticleAtURLIdx(i)
			// TODO: deal with redirect (a nil article is currently skipped)
			if err == nil && article != nil {
				out <- article
			}
		}
	}()

	return out
}
// ListTitlesPtr streams the entries of the title pointer list, in list order,
// on the returned channel, which is closed after ArticleCount slots have been
// visited.
//
// Each entry is an index into the URL pointer list, which makes it cheap to
// store (uint32) when building an index. This is a slow implementation; on
// big ZIM files prefer ListTitlesPtrIterator.
//
// Slots that cannot be read are skipped. The loop is bounded by the slot
// index rather than by the number of successful reads, so a truncated or
// corrupt file cannot make it run past the end of the list.
func (z *ZimReader) ListTitlesPtr() <-chan uint32 {
	ch := make(chan uint32, 10)

	go func() {
		defer close(ch)
		for i := uint32(0); i < z.ArticleCount; i++ {
			// Each title pointer slot is 4 bytes wide.
			pos := z.titlePtrPos + uint64(i)*4
			idx, err := readInt32(z.bytesRangeAt(pos, pos+4))
			if err != nil {
				continue
			}
			ch <- idx
		}
	}()

	return ch
}
// ListTitlesPtrIterator calls cb with every entry of the title pointer list,
// in list order.
//
// Each entry is an index into the URL pointer list, which makes it cheap to
// store (uint32) when building an index.
//
// Slots that cannot be read are skipped. The loop is bounded by the slot
// index rather than by the number of successful reads, so a truncated or
// corrupt file cannot make it run past the end of the list.
func (z *ZimReader) ListTitlesPtrIterator(cb func(uint32)) {
	for i := uint32(0); i < z.ArticleCount; i++ {
		// Each title pointer slot is 4 bytes wide.
		pos := z.titlePtrPos + uint64(i)*4
		idx, err := readInt32(z.bytesRangeAt(pos, pos+4))
		if err != nil {
			continue
		}
		cb(idx)
	}
}
// GetPageNoIndex returns the article whose full URL is exactly url, found by
// binary search over the URL pointer list without any external index.
//
// The search relies on the URL pointer list being sorted by full URL. It
// covers the half-open slot range [0, ArticleCount), so every slot, including
// slot 0, can be found, and an empty archive yields ErrNotFound without
// touching the file.
//
// It returns ErrNotFound (wrapped with a stack trace) when no entry matches,
// or the underlying error when an entry cannot be read.
func (z *ZimReader) GetPageNoIndex(url string) (*Article, error) {
	a := new(Article)

	lo, hi := uint32(0), z.ArticleCount
	for lo < hi {
		// Written this way so the midpoint cannot overflow uint32.
		mid := lo + (hi-lo)/2

		offset, err := z.OffsetAtURLIdx(mid)
		if err != nil {
			return nil, err
		}
		err = z.FillArticleAt(a, offset)
		if err != nil {
			return nil, err
		}

		switch full := a.FullURL(); {
		case full == url:
			return a, nil
		case full > url:
			hi = mid
		default:
			lo = mid + 1
		}
	}

	return nil, errors.WithStack(ErrNotFound)
}
// OffsetAtURLIdx returns the 64-bit value stored in slot idx of the URL
// pointer list, which callers use as the file offset of the corresponding
// article entry. The error from the underlying read is returned unchanged.
func (z *ZimReader) OffsetAtURLIdx(idx uint32) (uint64, error) {
	const slotSize = 8 // each URL pointer slot is 8 bytes wide

	from := z.urlPtrPos + uint64(idx)*slotSize
	return readInt64(z.bytesRangeAt(from, from+slotSize))
}
// Close closes the underlying file handle held by the reader and returns
// whatever error the file's Close method reports.
func (z *ZimReader) Close() error {
return z.f.Close()
}
// String returns a human-readable summary of the reader: the file size, the
// article count, the header offsets and the MIME type list. It returns
// "corrupted zim" when the underlying file cannot be stat'ed.
//
// Building the summary calls MimeTypes, which may read from the file if the
// list has not been cached yet.
func (z *ZimReader) String() string {
	info, err := z.f.Stat()
	if err != nil {
		return "corrupted zim"
	}

	const layout = "Size: %d, ArticleCount: %d urlPtrPos: 0x%x titlePtrPos: 0x%x mimeListPos: 0x%x clusterPtrPos: 0x%x\nMimeTypes: %v"
	return fmt.Sprintf(
		layout,
		info.Size(),
		z.ArticleCount,
		z.urlPtrPos,
		z.titlePtrPos,
		z.mimeListPos,
		z.clusterPtrPos,
		z.MimeTypes(),
	)
}
// bytesRangeAt returns a freshly allocated slice holding the bytes of the
// half-open file range [start, end).
//
// It exists to abstract how the file is accessed, so callers never read the
// underlying storage directly.
//
// An error is returned when the range is inverted (end < start), when the
// read fails, or when fewer bytes than requested are available. Rejecting an
// inverted range up front matters because the length is computed in uint64:
// end-start would otherwise wrap to a huge value and panic in make.
func (z *ZimReader) bytesRangeAt(start, end uint64) ([]byte, error) {
	if end < start {
		return nil, fmt.Errorf("can't read bytes: invalid range [%d, %d)", start, end)
	}

	buf := make([]byte, end-start)
	n, err := z.f.ReadAt(buf, int64(start))
	if err != nil {
		return nil, fmt.Errorf("can't read bytes %w", err)
	}
	if n != len(buf) {
		return nil, errors.New("can't read enough bytes")
	}
	return buf, nil
}
// readFileHeaders checks that the file starts with the ZIM magic number and
// then fills the ZimReader fields from the fixed-position header values.
//
// It returns "not a ZIM file" when the magic number is missing or unreadable,
// a wrapped error when the version cannot be read, and the raw read error for
// any other field. Fields are assigned in file order, so on failure the
// fields located before the failing one have already been updated.
func (z *ZimReader) readFileHeaders() error {
	// A failed read of the magic number is reported like a mismatch.
	magic, err := readInt32(z.bytesRangeAt(0, 4))
	if err != nil || magic != zimHeader {
		return errors.New("not a ZIM file")
	}

	// The version only has to be readable; its value is not kept.
	if _, err = readInt32(z.bytesRangeAt(4, 8)); err != nil {
		return errors.Wrap(err, "could not read file version")
	}

	// Remaining header fields in file order. Exactly one of u32/u64 is set per
	// row and selects which decoder is applied to the byte range [from, to).
	fields := []struct {
		from, to uint64
		u32      *uint32
		u64      *uint64
	}{
		// The UUID row hands an 8-byte range to the 32-bit decoder.
		{from: 8, to: 16, u32: &z.UUID},
		{from: 24, to: 28, u32: &z.ArticleCount},
		{from: 28, to: 32, u32: &z.clusterCount},
		{from: 32, to: 40, u64: &z.urlPtrPos},
		{from: 40, to: 48, u64: &z.titlePtrPos},
		{from: 48, to: 56, u64: &z.clusterPtrPos},
		{from: 56, to: 64, u64: &z.mimeListPos},
		{from: 64, to: 68, u32: &z.mainPage},
		{from: 68, to: 72, u32: &z.layoutPage},
	}

	for _, fld := range fields {
		if fld.u32 != nil {
			v, err := readInt32(z.bytesRangeAt(fld.from, fld.to))
			if err != nil {
				return err
			}
			*fld.u32 = v
			continue
		}

		v, err := readInt64(z.bytesRangeAt(fld.from, fld.to))
		if err != nil {
			return err
		}
		*fld.u64 = v
	}

	// Parse the MIME type list now so it is cached for later calls; the
	// returned slice itself is not needed here.
	z.MimeTypes()
	return nil
}
// clusterOffsetsAtIdx returns the start and end offsets of the cluster at
// index idx.
//
// start is the value stored in slot idx of the cluster pointer list; end is
// the value stored in the following slot minus one, i.e. the last byte before
// the next cluster begins. On error both offsets are returned as zero.
//
// NOTE(review): for the last cluster the "following slot" lies just past the
// pointer list, and a stored value of 0 would wrap end around — confirm how
// callers expect those cases to be handled.
func (z *ZimReader) clusterOffsetsAtIdx(idx uint32) (start, end uint64, err error) {
	const slotSize = 8 // each cluster pointer slot is 8 bytes wide

	pos := z.clusterPtrPos + uint64(idx)*slotSize
	start, err = readInt64(z.bytesRangeAt(pos, pos+slotSize))
	if err != nil {
		return 0, 0, err
	}

	// Advance in uint64 so that idx+1 cannot wrap around in uint32.
	pos += slotSize
	end, err = readInt64(z.bytesRangeAt(pos, pos+slotSize))
	if err != nil {
		return 0, 0, err
	}

	return start, end - 1, nil
}