Refactor: clean up debug logging and index the feed type metadata found in feeds to distinguish between user, bot and RSS feeds

pull/11/head
James Mills 3 months ago
parent eb2c6dfbc1
commit e8444538ad
Signed by: prologic
GPG Key ID: AC4C014F1440EBD6
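
Context for this change: twtxt/Yarn feeds can declare what kind of feed they are via a "# type = ..." comment in the feed's metadata preamble, which the scraper change below reads with tf.Info().GetN("type", 0). A hypothetical preamble that would now be indexed with type "bot" (the nick and url values are illustrative):

# nick = newsbot
# url = https://example.com/newsbot/twtxt.txt
# type = bot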
  1. .gitignore (8 changes)
  2. cmd/getfeeds/main.go (13 changes)
  3. cmd/geturl/geturl.go (118 changes)
  4. cmd/hashurl/hashurl.go (85 changes)
  5. internal/crawl_task.go (6 changes)
  6. internal/entry.go (3 changes)
  7. internal/handlers.go (1 change)
  8. internal/indexer.go (13 changes)
  9. internal/jobs.go (5 changes)
  10. internal/links.go (1 change)
  11. internal/results.go (1 change)
  12. internal/scrape_task.go (1 change)
  13. internal/scraper.go (17 changes)
  14. internal/templates/_partials.html (1 change)
  15. internal/utils.go (1 change)

.gitignore (vendored, 8 changes)

@@ -9,10 +9,6 @@
 /data
 /yarns
-/getlinks
-/geturl
-/hashurl
+/getfeeds
 /cmd/yarns/yarns
-/cmd/geturl/geturl
-/cmd/hashurl/hashurl
-/cmd/getlinks/getlinks
+/cmd/getfeeds/getfeeds

cmd/getfeeds/main.go (13 changes)

@@ -19,7 +19,8 @@ var (
 )

 const helpText = `
-getlinks crawls a feed finding other feeds in mentions
+getfeeds crawls the Twtxt/Yarn space from a starting feed finding other feeds
+in mentions and displays all feeds found
 `

 func init() {
@@ -60,7 +61,7 @@ func main() {
     parseArgs()

     if version {
-        fmt.Printf("index_archive version %s", yarns.FullVersion())
+        fmt.Printf("getfeeds version %s", yarns.FullVersion())
         os.Exit(0)
     }
@@ -78,13 +79,13 @@ func main() {
         os.Exit(1)
     }

-    url := flag.Arg(0)
+    start := flag.Arg(0)

     count := 0
-    feeds := internal.FindFeeds(url, nil)
+    feeds := internal.FindFeeds(start, nil)
     for feed := range feeds {
-        fmt.Printf("found feed %s\n", feed)
+        fmt.Println(feed)
         count++
     }
-    fmt.Printf("found %d feeds\n", count)
+    log.Infof("found %d feeds", count)
 }
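
With these changes getfeeds prints one feed URI per line on stdout and moves the summary to the logger, so the output can be piped cleanly. A hypothetical invocation (the starting feed URL, results and timing are illustrative):

$ getfeeds https://example.com/twtxt.txt
https://example.com/alice/twtxt.txt
https://example.net/bob/twtxt.txt
INFO[0003] found 2 feeds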

cmd/geturl/geturl.go (118 changes, file deleted)

@@ -1,118 +0,0 @@
-package main
-
-import (
-    "encoding/json"
-    "fmt"
-    "os"
-    "path/filepath"
-    "strings"
-
-    "git.mills.io/yarnsocial/yarns"
-    "git.mills.io/yarnsocial/yarns/internal"
-
-    sync "github.com/sasha-s/go-deadlock"
-    log "github.com/sirupsen/logrus"
-    flag "github.com/spf13/pflag"
-)
-
-var (
-    debug   bool
-    version bool
-    path    string
-)
-
-const helpText = `
-geturl retrives a url from the database
-`
-
-func init() {
-    baseProg := filepath.Base(os.Args[0])
-    flag.Usage = func() {
-        fmt.Fprintf(os.Stderr, "Usage: %s [options] path\n", baseProg)
-        fmt.Fprint(os.Stderr, helpText)
-        flag.PrintDefaults()
-    }
-
-    flag.BoolVarP(&debug, "debug", "d", false, "enable debug logging")
-    flag.BoolVarP(&version, "version", "v", false, "display version information")
-    flag.StringVarP(&path, "path", "p", "yarns.db", "path to yarns database")
-}
-
-func flagNameFromEnvironmentName(s string) string {
-    s = strings.ToLower(s)
-    s = strings.Replace(s, "_", "-", -1)
-    return s
-}
-
-func parseArgs() error {
-    for _, v := range os.Environ() {
-        vals := strings.SplitN(v, "=", 2)
-        flagName := flagNameFromEnvironmentName(vals[0])
-        fn := flag.CommandLine.Lookup(flagName)
-        if fn == nil || fn.Changed {
-            continue
-        }
-        if err := fn.Value.Set(vals[1]); err != nil {
-            return err
-        }
-    }
-    flag.Parse()
-    return nil
-}
-
-func main() {
-    parseArgs()
-
-    if version {
-        fmt.Printf("hash_url version %s", yarns.FullVersion())
-        os.Exit(0)
-    }
-
-    if debug {
-        log.SetLevel(log.DebugLevel)
-    } else {
-        log.SetLevel(log.InfoLevel)
-        // Disable deadlock detection in production mode
-        sync.Opts.Disable = true
-    }
-
-    if flag.NArg() < 1 {
-        flag.Usage()
-        os.Exit(1)
-    }
-
-    uri := flag.Arg(0)
-
-    db, err := internal.NewBitcaskStore(path)
-    if err != nil {
-        log.WithError(err).Fatal("error opening database")
-    }
-
-    if err := db.Merge(); err != nil {
-        log.WithError(err).Fatal("error merging store")
-    }
-
-    url, isNew, err := db.GetOrSetURL(uri)
-    if err != nil {
-        log.WithError(err).Fatal("error getting url")
-    }
-    log.Debugf("isNew: %t", isNew)
-
-    /*
-        hash := internal.HashURL(uri)
-        url, err := db.GetURL(hash)
-        if err != nil {
-            log.WithError(err).Fatal("error getting url")
-        }
-    */
-
-    log.Debugf("found url %s with hash %s at %s", url.URL, url.Hash(), url.Key())
-
-    data, err := json.Marshal(url)
-    if err != nil {
-        log.WithError(err).Fatal("error marshaling json")
-    }
-    fmt.Println(string(data))
-}

cmd/hashurl/hashurl.go (85 changes, file deleted)

@@ -1,85 +0,0 @@
-package main
-
-import (
-    "fmt"
-    "os"
-    "path/filepath"
-    "strings"
-
-    "git.mills.io/yarnsocial/yarns"
-    "git.mills.io/yarnsocial/yarns/internal"
-
-    sync "github.com/sasha-s/go-deadlock"
-    log "github.com/sirupsen/logrus"
-    flag "github.com/spf13/pflag"
-)
-
-var (
-    debug   bool
-    version bool
-)
-
-const helpText = `
-hashurl hashes a url which is internally used to track feeds already visited
-`
-
-func init() {
-    baseProg := filepath.Base(os.Args[0])
-    flag.Usage = func() {
-        fmt.Fprintf(os.Stderr, "Usage: %s [options] path\n", baseProg)
-        fmt.Fprint(os.Stderr, helpText)
-        flag.PrintDefaults()
-    }
-
-    flag.BoolVarP(&debug, "debug", "d", false, "enable debug logging")
-    flag.BoolVarP(&version, "version", "v", false, "display version information")
-}
-
-func flagNameFromEnvironmentName(s string) string {
-    s = strings.ToLower(s)
-    s = strings.Replace(s, "_", "-", -1)
-    return s
-}
-
-func parseArgs() error {
-    for _, v := range os.Environ() {
-        vals := strings.SplitN(v, "=", 2)
-        flagName := flagNameFromEnvironmentName(vals[0])
-        fn := flag.CommandLine.Lookup(flagName)
-        if fn == nil || fn.Changed {
-            continue
-        }
-        if err := fn.Value.Set(vals[1]); err != nil {
-            return err
-        }
-    }
-    flag.Parse()
-    return nil
-}
-
-func main() {
-    parseArgs()
-
-    if version {
-        fmt.Printf("hash_url version %s", yarns.FullVersion())
-        os.Exit(0)
-    }
-
-    if debug {
-        log.SetLevel(log.DebugLevel)
-    } else {
-        log.SetLevel(log.InfoLevel)
-        // Disable deadlock detection in production mode
-        sync.Opts.Disable = true
-    }
-
-    if flag.NArg() < 1 {
-        flag.Usage()
-        os.Exit(1)
-    }
-
-    url := flag.Arg(0)
-    hash := internal.HashURL(url)
-    fmt.Println(hash)
-}

internal/crawl_task.go (6 changes)

@@ -103,7 +103,6 @@ func (t *CrawlTask) Run() error {
         // Skip feeds we've already seen by URI
         if _, ok := seen.LoadOrStore(feed, true); ok {
-            log.Debugf("already seen feed %s", feed)
             continue
         }
@@ -112,8 +111,6 @@ func (t *CrawlTask) Run() error {
         wg.Add(1)
         go func(feed string) {
-            log.Debugf("started scraper for %s", feed)
-
             defer func() {
                 metrics.Counter("crawler", "crawled").Inc()
                 atomic.AddInt64(&nCrawled, 1)
@@ -131,17 +128,14 @@ func (t *CrawlTask) Run() error {
             if newFeed {
                 uri := res.Twter().URI
                 hash := HashURL(uri)
-                log.Debugf("found new url %s with hash %s", uri, hash)
                 url, err := t.db.GetURL(hash)
                 if err != nil {
                     log.WithError(err).Errorf("error loading url %s", res.Twter().URI)
                     return
                 }
-                log.Debugf("url %s has key %s", url.URL, url.Key())
                 if url.DiscoveredAt.IsZero() {
                     url.DiscoveredAt = time.Now()
-                    log.Debugf("setting DiscoveringAt to %s for %s with key %s", url.DiscoveredAt, url.URL, url.Key())
                     if err := t.db.SetURL(url.Hash(), url); err != nil {
                         log.WithError(err).Errorf("error updating url %s", url)
                     } else {

internal/entry.go (3 changes)

@@ -10,7 +10,7 @@ import (
 const DefaultSearchField = "_all"

 var SearchFields = []string{DefaultSearchField,
-    "text", "conv", "subject", "tags", "links", "mentions", "nick", "feed", "author",
+    "type", "text", "conv", "subject", "tags", "links", "mentions", "nick", "feed", "author",
 }

 func IsValidSearchField(f string) bool {
@@ -26,6 +26,7 @@ func IsValidSearchField(f string) bool {
 type Entry struct {
     twt     types.Twt
+    Type    string `json:"type"`
     Text    string `json:"text"`
     Conv    string `json:"conv"`
     Subject string `json:"subject"`

internal/handlers.go (1 change)

@@ -183,7 +183,6 @@ func (svr *Server) SearchHandler() httprouter.Handle {
         return
     }
     ctx.Title = fmt.Sprintf("%s Twtxt Search", qs)
-    log.Debugf("qs: %s", qs)
     p := SafeParseInt(req.FormValue("p"), 1)
     f := req.FormValue("f")

internal/indexer.go (13 changes)

@@ -10,7 +10,6 @@ import (
     "github.com/blevesearch/bleve/v2"
     "github.com/blevesearch/bleve/v2/analysis/analyzer/web"
     "github.com/blevesearch/bleve/v2/search/query"
-    humanize "github.com/dustin/go-humanize"
     log "github.com/sirupsen/logrus"
 )
@@ -118,6 +117,9 @@ func NewBleveIndexer(conf *Config) (Indexer, error) {
     } else {
         docMapping := bleve.NewDocumentMapping()

+        feedType := bleve.NewKeywordFieldMapping()
+        docMapping.AddFieldMappingsAt("type", feedType)
+
         text := bleve.NewTextFieldMapping()
         docMapping.AddFieldMappingsAt("text", text)
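
The keyword mapping is the important detail in this hunk: a keyword field is indexed as a single untokenized term, so an exact term query for "bot" or "rss" matches reliably, whereas the text mapping used for "text" runs the analyzer first. A minimal self-contained sketch of the same idea (the document ID and contents are illustrative, not from the codebase):

package main

import (
    "fmt"

    "github.com/blevesearch/bleve/v2"
)

func main() {
    docMapping := bleve.NewDocumentMapping()
    // index "type" verbatim as one term, as the hunk above does
    docMapping.AddFieldMappingsAt("type", bleve.NewKeywordFieldMapping())

    mapping := bleve.NewIndexMapping()
    mapping.DefaultMapping = docMapping

    idx, err := bleve.NewMemOnly(mapping) // throwaway in-memory index
    if err != nil {
        panic(err)
    }
    _ = idx.Index("entry-1", map[string]string{"type": "bot"})

    // term queries are not analyzed, so they pair well with keyword fields
    q := bleve.NewTermQuery("bot")
    q.SetField("type")
    res, _ := idx.Search(bleve.NewSearchRequest(q))
    fmt.Println(res.Total) // 1: exact match on the keyword field
}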
@@ -192,26 +194,23 @@ func (i *bleveIndexer) Search(qs string, opts ...SearchOption) (results *SearchR
     switch o.Type {
     case QueryString:
         q = bleve.NewQueryStringQuery(qs)
-        log.Debug("Query String")
     case MatchQuery:
         mq := bleve.NewMatchQuery(qs)
         mq.SetField(o.Field)
         mq.SetOperator(query.MatchQueryOperatorAnd)
         mq.SetFuzziness(1)
         q = mq
-        log.Debug("Match Query")
     case TermQuery:
         tq := bleve.NewTermQuery(qs)
         tq.SetField(o.Field)
         q = tq
-        log.Debug("Term Query")
     default:
         return nil, fmt.Errorf("unsupported query type: %s", o.Type)
     }

     req := bleve.NewSearchRequest(q)
     req.Fields = []string{
-        "hash", "text", "conv", "created",
+        "type", "hash", "text", "conv", "created",
         "nick", "feed", "author", "tags", "mentions", "avatar",
     }
     req.SortBy(o.Sort)
@@ -219,9 +218,6 @@ func (i *bleveIndexer) Search(qs string, opts ...SearchOption) (results *SearchR
     req.Size = i.conf.ResultsPerPage
     req.From = req.Size * (o.Page - 1)

-    memoryNeeded := bleve.MemoryNeededForSearchResult(req)
-    log.Debugf("Memory needed for search request: %s", humanize.Bytes(memoryNeeded))

     res, err := i.idx.Search(req)
     if err != nil {
         log.WithError(err).Error("error searching index")
@@ -281,6 +277,7 @@ func (i *bleveIndexer) Search(qs string, opts ...SearchOption) (results *SearchR
         result := Result{
             ID: hit.ID,
             Score: hit.Score,
+            Type: hit.Fields["type"].(string),
             Conv: hit.Fields["conv"].(string),
             Created: t,
             Nick: hit.Fields["nick"].(string),
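
One caveat about this hunk (an observation, not something the commit addresses): entries indexed before this change carry no "type" field, so hit.Fields["type"] is nil and the unchecked .(string) assertion panics. A defensive comma-ok read would tolerate such documents:

// the zero value "" stands in when the stored document predates the field
feedType, _ := hit.Fields["type"].(string)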

internal/jobs.go (5 changes)

@@ -100,18 +100,16 @@ func (job *ScrapeActiveFeedsJob) Run() {
         }

         if url.Dead {
-            log.Debugf("skipping dead feed %s: %s", url, url.LastError)
             return nil
         }

         if url.Failure > maxFeedFailures {
-            log.Debugf("giving up on feed %s with > %d failures", url, maxFeedFailures)
             return nil
         }

         lastScrapedAt := time.Since(url.LastScrapedAt)
-        log.Debugf(
+        log.Infof(
             "rescraping %s (lastscraped=%s lasterror=%s fetchavg=%0.2f newavg=%0.2f)",
             url, lastScrapedAt, url.LastError, url.FetchAvg, url.NewAvg,
         )
@@ -155,7 +153,6 @@ func (job *ScrapeBrokenFeedsJob) Run() {
         }

         if url.LastScrapedAt.IsZero() {
-            log.Debugf("adding %s to scraper queue (never scraped)", url)
             recrawl(url)
             return nil
         }

internal/links.go (1 change)

@@ -65,7 +65,6 @@ func FindFeeds(start string, proxy Proxy) chan string {
         }

         if visited {
-            log.Debugf("already visited %s", u)
             return nil
         }

internal/results.go (1 change)

@@ -7,6 +7,7 @@ import (
 type Result struct {
     ID   string
+    Type string
     Hash string
     Conv string
     Nick string

internal/scrape_task.go (1 change)

@@ -76,7 +76,6 @@ func (t *ScrapeTask) Run() error {
         if newFeed {
             uri := res.Twter().URI
             hash := HashURL(uri)
-            log.Debugf("found new url %s with hash %s", uri, hash)
             url, err := t.db.GetURL(hash)
             if err != nil {
                 log.WithError(err).Errorf("error loading url %s", res.Twter().URI)

internal/scraper.go (17 changes)

@@ -73,7 +73,6 @@ func ScrapeAndIndex(feed string, seen *sync.Map, conf *Config, db Store, archive
     )

     url, newFeed, err := db.GetOrSetURL(feed)
-    log.Debugf("db.GetOrSetURL(%q): %#v %t %q", feed, url, newFeed, err)
     if err != nil {
         return false, nil, fmt.Errorf("error getting or setting url from db for %s: %w", feed, err)
     }
@@ -304,6 +303,21 @@ func Scrape(conf *Config, req ScrapeRequest) (*ScrapeResult, error) {
     opts := mockFmtOpts{""}

+    // TODO: Add .Type() to go.yarn.social/types.TwtFile interface
+    // TODO: Add valid feed types to go.yarn.social/types somewhere...
+    feedType := ""
+    if v, ok := tf.Info().GetN("type", 0); ok {
+        s := strings.ToLower(v.Value())
+        switch s {
+        case "rss", "bot":
+            feedType = s
+        case "":
+            // feed is a normal user
+        default:
+            // unsupported feed type
+        }
+    }
+
     for _, twt := range tf.Twts() {
         tags := []string{}
         for _, tag := range twt.Tags() {
@@ -327,6 +341,7 @@ func Scrape(conf *Config, req ScrapeRequest) (*ScrapeResult, error) {
         entry := Entry{
             twt: twt,
+            Type: feedType,
             Text: twt.FormatText(types.TextFmt, opts),
             Conv: twt.Subject().Text(),
             Tags: tags,
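
Taken together, the two scraper hunks mean every indexed entry now carries its feed's declared type, with the empty string standing in for a regular user feed. A hypothetical helper that restates the switch above more compactly (classifyFeedType is not in the codebase):

package main

import (
    "fmt"
    "strings"
)

// classifyFeedType keeps "rss" and "bot"; anything else, including a
// missing or unsupported declaration, indexes as a user feed ("").
func classifyFeedType(declared string) string {
    switch s := strings.ToLower(declared); s {
    case "rss", "bot":
        return s
    default:
        return ""
    }
}

func main() {
    fmt.Println(classifyFeedType("Bot")) // bot
    fmt.Println(classifyFeedType(""))    // (empty: regular user feed)
}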

internal/templates/_partials.html (1 change)

@@ -118,6 +118,7 @@
 Search by:
 <nav>
   <ul>
+    {{ with $.Type }}<li><a href="/search?q={{ . }}&t=term&f=type&s=created&s=_id">type:{{ . }}</a></li>{{ end }}
     <li><a href="/search?q={{ $.Conv }}&t=term&f=conv&s=created&s=_id">conv:{{ $.Conv }}</a></li>
     <li><a href="/search?q={{ $.Author }}&t=term&f=author&s=created&s=_id">author:{{ $.Author }}</a></li>
   </ul>
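
The new list item only renders when a result has a type, and clicking it issues a term query against the keyword field added in internal/indexer.go. The equivalent raw request for bot feeds would be:

/search?q=bot&t=term&f=type&s=created&s=_id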

internal/utils.go (1 change)

@@ -515,7 +515,6 @@ func roundDuration(d time.Duration) string {
     case u > nanos:
         return strconv.FormatUint(u/nanos, 10) + "ns"
     }
-    log.Debugf("u: %d", u)
     return "0s"
 }
