mirror of
https://github.com/mickael-kerjean/filestash.git
synced 2025-11-01 19:32:27 +08:00
86 lines
1.6 KiB
Go
86 lines
1.6 KiB
Go
package formater
|
|
|
|
import (
|
|
"archive/zip"
|
|
"bytes"
|
|
"encoding/xml"
|
|
"fmt"
|
|
. "github.com/mickael-kerjean/filestash/server/common"
|
|
"io"
|
|
"math/rand"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
func OfficeFormater(r io.ReadCloser) (io.ReadCloser, error) {
|
|
tmpName := fmt.Sprintf("/tmp/docx_%d.docx", rand.Intn(1000000))
|
|
defer os.Remove(tmpName)
|
|
f, err := os.OpenFile(tmpName, os.O_CREATE|os.O_WRONLY, os.ModePerm)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
_, err = io.Copy(f, r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
z, err := zip.OpenReader(tmpName)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer z.Close()
|
|
|
|
hasData := false
|
|
content := bytes.NewBuffer([]byte{})
|
|
for _, f := range z.File {
|
|
shouldExtract := false
|
|
if f.Name == "word/document.xml" {
|
|
shouldExtract = true
|
|
}
|
|
if strings.HasPrefix(f.Name, "ppt/slides/slide") {
|
|
shouldExtract = true
|
|
}
|
|
|
|
if shouldExtract == false {
|
|
continue
|
|
}
|
|
hasData = true
|
|
o, err := f.Open()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
dec := xml.NewDecoder(o)
|
|
for {
|
|
t, err := dec.Token()
|
|
if err != nil {
|
|
break
|
|
}
|
|
if t == nil {
|
|
break
|
|
}
|
|
switch el := t.(type) {
|
|
case xml.StartElement:
|
|
if el.Name.Local == "t" {
|
|
w := WordDoc{}
|
|
dec.DecodeElement(&w, &el)
|
|
if len(w.Text) > 0 {
|
|
w.Text = regexp.MustCompile("\\s+\\.\\s+").ReplaceAll(w.Text, []byte(". "))
|
|
w.Text = regexp.MustCompile("\\s{2,}").ReplaceAll(w.Text, []byte(" "))
|
|
content.Write(w.Text)
|
|
content.Write([]byte(" "))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if hasData == false {
|
|
return nil, ErrNotFound
|
|
}
|
|
return NewReadCloserFromReader(content), nil
|
|
}
|
|
|
|
type WordDoc struct {
|
|
Text []byte `xml:",innerxml"`
|
|
}
|