我有一个大的文本日志文件,其中包含两个由特殊字符分隔的部分,就像
...
this is the very large
part, contains lots lines.
#SPECIAL CHARS START#
...
this is the small part the the end,
contain several lines, but
we do not know how many lines
this part contains
我的要求是获得#SPECIAL CHARS START#
之后到最后的一小部分文本内容,如何使用Golang有效地获得它?
更新:我当前的解决方案是从文件末尾向前逐行读取并记录光标位置;一旦某行包含特殊字符,就跳出循环,此时的光标(相对于文件末尾的偏移量)即标记了特殊字符所在行的位置。
// getBackwardLine scans file backward one byte at a time and returns the line
// that ends immediately before start, together with the offset at which the
// scan stopped.
//
// start and the returned cursor are offsets relative to the end of the file:
// 0 is EOF, -1 is the last byte and -size is the first byte. A start above 0
// is treated as 0. The scan stops on a '\n' or '\r' byte, which is not part of
// the returned line, or once the first byte of the file has been consumed. A
// line terminator that is the very last byte of the file does not stop the
// scan and is kept in the returned line.
//
// When start is already at or before the first byte, the file is empty, or the
// file cannot be inspected or read, the scan ends early and whatever was
// collected so far (possibly "") is returned. Passing the returned cursor back
// in therefore always terminates instead of running past the start of the file.
//
// The file's read offset is moved by this call.
func getBackwardLine(file *os.File, start int64) (string, int64) {
	stat, err := file.Stat()
	if err != nil {
		return "", start
	}
	filesize := stat.Size()

	cursor := start
	if cursor > 0 {
		cursor = 0
	}

	// Bytes are collected in the order they are read (last byte first) and
	// reversed once at the end, which avoids rebuilding the string per byte.
	var reversed []byte
	char := make([]byte, 1)
	for cursor > -filesize {
		cursor--
		if _, err := file.Seek(cursor, io.SeekEnd); err != nil {
			cursor++ // this byte was never consumed
			break
		}
		if n, err := file.Read(char); n != 1 || err != nil {
			cursor++ // this byte was never consumed
			break
		}
		// cursor == -1 is the final byte of the file: a trailing line
		// terminator there must not end the scan before any text is read.
		if cursor != -1 && (char[0] == '\n' || char[0] == '\r') {
			break
		}
		reversed = append(reversed, char[0])
	}

	for i, j := 0, len(reversed)-1; i < j; i, j = i+1, j-1 {
		reversed[i], reversed[j] = reversed[j], reversed[i]
	}
	return string(reversed), cursor
}
// main prints the lines of some.log from last to first until it reaches the
// line containing "#SPECIAL CHARS START#", then prints the cursor at which the
// backward scan stopped. Failures, including a missing marker, are reported on
// standard error with exit status 1.
func main() {
	cursor, err := markerCursor("some.log", "#SPECIAL CHARS START#")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	// The cursor is an offset relative to the end of the file; it is where
	// getBackwardLine stopped after reading the line that holds the marker.
	fmt.Println(cursor)
}

// markerCursor walks the file at path backward line by line, printing each
// line, and returns the cursor reported by getBackwardLine for the first line
// (counting from the end) that contains marker. It stops with an error once
// the start of the file is reached without seeing marker, so a file without
// the marker cannot make the loop spin forever.
func markerCursor(path, marker string) (int64, error) {
	file, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	stat, err := file.Stat()
	if err != nil {
		return 0, err
	}
	size := stat.Size()

	var cursor int64
	for cursor > -size {
		var line string
		line, cursor = getBackwardLine(file, cursor)
		fmt.Println(line)
		if strings.Contains(line, marker) {
			return cursor, nil
		}
	}
	return 0, fmt.Errorf("%s: marker %q not found", path, marker)
}
此解决方案实现了一个反向(从后向前)读取器。
它从文件末尾开始,以 b.Len 个字节为一块读取文件,然后在该块内向前查找分隔符(当前为 \n),
并将起始偏移量向前推进 SepIndex(这是为了防止搜索字符串被拆分到两次连续的读取中)。
在读取下一个块之前,它会在已读取的块中查找 search 字符串;如果找到,则返回其在文件中的起始位置并停止。
否则,它将起始偏移量减少 b.Len,然后读取下一个块。
只要您的搜索字符串位于文件的最后 40% 以内,您就应该获得更好的性能,但这需要经过实战测试。
如果您的搜索字符串在最后10%以内,我相信您会赢。
package main
import (
"bytes"
"io"
"io/ioutil"
"log"
"os"
)
// main exercises backwardSearch against temporary files in which the search
// string is preceded by different amounts of padding: none, just under one
// block, exactly one block, slightly over one block, and slightly over two
// blocks. It stops at the first fixture that fails.
func main() {
	search := []byte("find me")
	blockLen := 1024

	paddings := []int{
		0,
		blockLen - 5,
		blockLen,
		blockLen + 10,
		(blockLen * 2) + 10,
	}
	for _, padding := range paddings {
		if err := checkFixture(search, padding, blockLen); err != nil {
			log.Fatal(err)
		}
	}
}

// checkFixture writes padding spaces followed by search to a fresh temporary
// file, looks search up with backwardSearch using blocks of blockLen bytes,
// logs the outcome, and verifies that the file really holds search at the
// reported offset. The temporary file is closed and removed before returning,
// so nothing is left open or on disk between fixtures.
func checkFixture(search []byte, padding, blockLen int) error {
	f, err := ioutil.TempFile("", "")
	if err != nil {
		return err
	}
	// Deferred calls run last-in first-out: close the file, then remove it.
	defer os.Remove(f.Name())
	defer f.Close()

	if _, err := f.Write(bytes.Repeat([]byte(" "), padding)); err != nil {
		return err
	}
	if _, err := f.Write(search); err != nil {
		return err
	}

	n, err := backwardSearch{Len: blockLen, F: f, Sep: []byte("\n")}.Find(search)
	log.Println(f.Name(), n, err)
	if err != nil {
		return err
	}

	check := make([]byte, len(search))
	if _, err := f.ReadAt(check, n); err != nil {
		return err
	}
	if !bytes.Equal(check, search) {
		// A wrong offset means the search itself is broken.
		panic("nop")
	}
	return nil
}
// backwardSearch looks for a byte sequence in a file by reading fixed-size
// blocks from the end of the file toward its beginning.
type backwardSearch struct {
	// Len is the block size in bytes. Values <= 0 select a default size.
	Len int
	// Sep is the line separator. When non-empty, the partial line at the
	// front of a block is left for the next read so that it is scanned whole.
	Sep []byte
	// F is the file to search. Find reads it with ReadAt and does not move
	// its read offset.
	F *os.File
	// start is not used by Find, which keeps its position in a local
	// variable; the field is kept so existing code keeps compiling.
	start int64
}

// Find returns the byte offset, from the beginning of the file, of the last
// occurrence of search in b.F.
//
// The file is read backward in blocks of b.Len bytes. Every read also covers
// the len(search)-1 bytes that follow the block, so an occurrence that
// straddles two blocks is found even when the block holds no separator.
//
// It returns (0, io.EOF) when search does not occur in the file, and the file
// size when search is empty. A failed Stat or read is returned as its error; a
// read that comes up short is reported as io.ErrUnexpectedEOF so that it
// cannot be mistaken for "not found".
func (b backwardSearch) Find(search []byte) (int64, error) {
	const defaultLen = 4096

	st, err := b.F.Stat()
	if err != nil {
		return 0, err
	}
	size := st.Size()
	if len(search) == 0 {
		return size, nil
	}

	blockLen := int64(b.Len)
	if blockLen <= 0 {
		blockLen = defaultLen
	}
	overlap := int64(len(search) - 1)
	buf := make([]byte, int(blockLen+overlap))

	// Invariant: no occurrence of search starts at or after end.
	end := size
	for end > 0 {
		offset := end - blockLen
		if offset < 0 {
			offset = 0
		}
		stop := end + overlap
		if stop > size {
			stop = size
		}

		chunk := buf[:int(stop-offset)]
		if n, err := b.F.ReadAt(chunk, offset); n < len(chunk) {
			if err == nil || err == io.EOF {
				err = io.ErrUnexpectedEOF
			}
			return 0, err
		}

		// Leave the partial line at the front of the block to the next
		// read. At the start of the file there is no next read, so the
		// whole block is scanned.
		skip := 0
		if offset > 0 && len(b.Sep) > 0 {
			if i := bytes.Index(chunk[:int(end-offset)], b.Sep); i > 0 {
				skip = i
			}
		}

		if i := bytes.LastIndex(chunk[skip:], search); i >= 0 {
			return offset + int64(skip+i), nil
		}
		end = offset + int64(skip)
	}
	return 0, io.EOF
}
请注意:我之前误解了您的问题,以为它是关于从字符串中读取的。我明天会更新此答案。