Skip to content

Commit

Permalink
Merge pull request #318 from Southclaws/chunked-embeddings
Browse files Browse the repository at this point in the history
use content chunking for indexed items
  • Loading branch information
Southclaws authored Dec 19, 2024
2 parents 4613bb4 + 0536943 commit 6bde4e2
Show file tree
Hide file tree
Showing 15 changed files with 353 additions and 191 deletions.
131 changes: 131 additions & 0 deletions app/resources/datagraph/content.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,3 +301,134 @@ func getSummary(article readability.Article) string {

return short
}

// roughMaxSentenceSize is a rough upper bound on sentence size for most
// languages. Split passes it to chunksFromNodes as the limit above which the
// text of a block is broken into smaller chunks. That comparison uses len() on
// a string, so the limit is measured in bytes, not characters.
const roughMaxSentenceSize = 350

// Split breaks the content's HTML tree into plain-text chunks.
//
// The tree is walked depth-first. Headings (h1-h6), blockquotes, pre blocks
// and paragraphs are collected whole and not descended into; any non-blank
// text node found outside of those elements is collected on its own. The
// collected nodes are then handed to chunksFromNodes, which flattens each one
// to text and splits anything longer than roughMaxSentenceSize.
func (c Content) Split() []string {
	blocks := []html.Node{}

	var visit func(node *html.Node)
	visit = func(node *html.Node) {
		switch node.Type {
		case html.ElementNode:
			switch node.DataAtom {
			case
				atom.H1,
				atom.H2,
				atom.H3,
				atom.H4,
				atom.H5,
				atom.H6,
				atom.Blockquote,
				atom.Pre,
				atom.P:
				// A block-level element is taken as one unit, so the walk
				// stops at this branch.
				blocks = append(blocks, *node)
				return
			}

		case html.TextNode:
			// Whitespace-only text (e.g. newlines between tags) is noise.
			if strings.TrimSpace(node.Data) == "" {
				return
			}
			blocks = append(blocks, *node)
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}
	visit(c.html)

	// Blocks whose text is "too big" are split further here.
	return chunksFromNodes(blocks, roughMaxSentenceSize)
}

// chunksFromNodes flattens every node in ns to its text content and returns
// the resulting chunks in order. Text of at most max bytes becomes a single
// chunk; longer text is passed to splitearly and contributes one chunk per
// piece. The returned slice is never nil.
func chunksFromNodes(ns []html.Node, max int) []string {
	out := make([]string, 0, len(ns))

	for i := range ns {
		text := textfromnode(&ns[i])
		if len(text) <= max {
			out = append(out, text)
			continue
		}

		out = append(out, splitearly(text, max)...)
	}

	return out
}

// splitearly breaks in into whitespace-trimmed chunks of at most max bytes,
// preferring to end each chunk at a sentence boundary.
//
// For every chunk the upper half of the first max bytes is scanned backwards:
//
//   - If a sentence terminator ('.', ';', '!' or '?', latin only at the
//     moment) is found, the text is split there. The terminator itself is
//     discarded, so chunks do not end in punctuation.
//   - Otherwise, if the remaining text already fits within max, it is emitted
//     whole: a split is only forced on text that is actually too long.
//   - Otherwise the text is split at the space closest to the limit, and the
//     space is discarded.
//   - As a last resort (a solid run with neither terminators nor spaces) the
//     text is cut at max bytes, moved back to the start of a UTF-8 sequence so
//     that no character is torn apart and no byte is lost.
//
// Empty chunks are never produced. A max of zero or less yields no chunks.
func splitearly(in string, max int) []string {
	if max <= 0 {
		// There is no usable chunk size; indexing with a negative bound
		// would panic.
		return nil
	}

	var chunks []string
	emit := func(s string) {
		if s = strings.TrimSpace(s); s != "" {
			chunks = append(chunks, s)
		}
	}

	rest := in
	for len(rest) > 0 {
		fits := len(rest) <= max

		// upper is the last byte that may belong to this chunk; only the
		// upper half of the window is searched so chunks stay reasonably
		// large.
		upper := max - 1
		if fits {
			upper = len(rest) - 1
		}
		lower := upper / 2

		// The delimiters are ASCII, and ASCII bytes never occur inside a
		// multi-byte UTF-8 sequence, so a byte-wise scan is safe.
		sentence, space := -1, -1
	scan:
		for i := upper; i > lower; i-- {
			switch rest[i] {
			// very rudimentary sentence boundaries (latin only at the moment)
			case '.', ';', '!', '?':
				sentence = i
				break scan
			// remember the space closest to the limit as a fallback
			case ' ':
				if space == -1 {
					space = i
				}
			}
		}

		switch {
		case sentence != -1:
			emit(rest[:sentence])
			rest = strings.TrimSpace(rest[sentence+1:])

		case fits:
			// Nothing forces a split: keep the tail in one piece.
			emit(rest)
			rest = ""

		case space != -1:
			// worst case: no sentence boundary, split at the closest space.
			emit(rest[:space])
			rest = strings.TrimSpace(rest[space+1:])

		default:
			// worst case: a solid block of text. Cut at the limit but step
			// back over UTF-8 continuation bytes (10xxxxxx) so the cut lands
			// on a character boundary.
			cut := max
			for cut > 0 && rest[cut]&0xC0 == 0x80 {
				cut--
			}
			if cut == 0 {
				// The first character alone is wider than max; take it
				// whole so the loop always makes progress.
				cut = 1
				for cut < len(rest) && rest[cut]&0xC0 == 0x80 {
					cut++
				}
			}
			emit(rest[:cut])
			rest = strings.TrimSpace(rest[cut:])
		}
	}

	return chunks
}

// textfromnode returns the concatenated Data of every text node found in the
// subtree rooted at n (n included), in document order. Markup is dropped and
// nothing is inserted between adjacent text nodes.
func textfromnode(n *html.Node) string {
	var sb strings.Builder

	var gather func(node *html.Node)
	gather = func(node *html.Node) {
		if node.Type == html.TextNode {
			sb.WriteString(node.Data)
		}

		for child := node.FirstChild; child != nil; child = child.NextSibling {
			gather(child)
		}
	}
	gather(n)

	return sb.String()
}
68 changes: 68 additions & 0 deletions app/resources/datagraph/content_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,71 @@ func TestNewRichText(t *testing.T) {
a.Equal(original, parsed)
})
}

// TestSplit checks that each block-level element becomes its own chunk: the
// heading and the four paragraphs give five chunks, while the image, which
// carries no text, gives none.
func TestSplit(t *testing.T) {
	r := require.New(t)
	a := assert.New(t)

	c, err := NewRichText(`<h1>heading</h1>
<p>Here's a paragraph. It's pretty neat.</p>
<p>Here's the rest of the text.</p>
<img src="http://image.com" />
<p>neat photo right?</p>
<p>This is quite a long post, the summary, should just be the first 128 characters rounded down to the nearest space.</p>`)
	r.NoError(err)
	// Assert on the parsed content, not on the require helper itself.
	r.NotNil(c)

	ps := c.Split()
	a.Len(ps, 5)
}

// TestSplitMinimal checks that short plain text without any block-level
// markup comes back as a single chunk.
func TestSplitMinimal(t *testing.T) {
	r := require.New(t)
	a := assert.New(t)

	c, err := NewRichText(`I&#39;ve tried everything, but for some reason it seems impossible to find datasets that includes a simple list of the councils in England sorted by their group, and a list of covid cases also sorted by councils. I&#39;m not British so it may be a lack of knowledge of how their government sites work.
Anyone know a place to find these?`)
	r.NoError(err)
	// Assert on the parsed content, not on the require helper itself.
	r.NotNil(c)

	ps := c.Split()
	a.Len(ps, 1)
}

// TestSplitLong feeds Split one short paragraph followed by one very long
// paragraph and pins the exact chunks produced: the short paragraph is kept
// verbatim, the long one is broken at sentence terminators (which are dropped)
// or, failing that, at a space.
func TestSplitLong(t *testing.T) {
	r := require.New(t)
	a := assert.New(t)

	c, err := NewRichText(`<body>
<p>A very short paragraph.</p>
<p>A very long single paragraph parato Solis gemitus nefandam munus cupidisque luminis Fuge fuit vestra undis laudando in aristas Lorem markdownum trepidantum genetricis [late](http://hic.com/nunc), steterat delendaque summique non? Domum si cursus supremaque aeraque [manibus](http://www.hoc-quod.io/); Iovis illi fert Bacchum, vulneraque __Oleniae suis__ increpat. Sanguine raucis albet Martius infantemque est _parili multam_ auditaque Caencu inferior augent, vix dote telae volat nec horto sceleratus. Mirator ambage! _Tale_ hic Diomedis, arva sonum factas maxima et relicto Longa incustoditam fixus. Praestantissima operata ardere semine per formae quod: in accipe quamvis amoribus, aquis medio confido puer stridore et clamavit? Video invidiosa Glauci flaventibus funeribus _tenet viscera boumque_ tacuit mearum unda interea calorem poscunt primum. Materia et Ethemon attonitae pactaeque in auctor, furiosior miscuerat? Anguem Arctonque receptus ait vacuus vestigia vapore praetulit, moves? var compression = array + pretestVrmlLion.mediaNetbios(file(topologyFlash, 4), 625909); fat_encoding(4, 1); honeypot += cisc; tigerOutboxFile -= repository_memory_symbolic(skin_dlc(5, sampling_excel)); var floppySmtp = core(5, fiber_opengl_touchscreen, gigabitProxy + rdf_cycle_scrolling); In orare ut finitimi Cum essemus spisso inferias: quam post: __unus cum tendebat__, ad, ecquem. Putaret vulnera noxque de vadorum materiam nomen, nives inroravere equidem Hippason hic convivia: per. var reimage_pup_case = saas + 2; cache = delExecutableMotion.subdirectory_service_status(-3, nosqlBin, analog_thick_rte) - adf; var ddlKerning = sector / meme + 3; Inania classes te lavere; _decimo iter prominet_ Scirone, contra harena! Neu multaque vocatur raucum dux pia, fruges illam Cupidinis huius, corruit aurea crudelius structis. Velamina manebit, unda desint saetas, deteriora domumque: e haec Ceres. Sic duro qua quidque victi Ino non, valentes fuit. 
Ventisque habeto [aliter feto vinci](http://partibus.org/)! Tonitrumque fidas quaerensque proles temptatum citharae iuguli duae patrio coniugis est mea genus dominae, ut nisi [ignoro](http://letiferos.net/auratis-humili.php)? Aetatis lactantia; ad et per clivo cognovit pretium. Adorat Solem illa flumine nobis patriam auxiliare illa Theseus dubioque lunae nactus discedere obiecit, e. Where do I begin? Easy. There are lots of problems with current forms of government. Clearly Democracy is not working (see [https:&#x2F;&#x2F;www.youtube.com&#x2F;watch?v=QFgcqB8-AxE] for a pretty short summary of why). Based on the IQ distribution, we have that 50% of people are below 100 IQ. If you go to a university or other decently high skill job, think about the dumbest person at that job. They probably have a 110+ IQ. Now consider that over 50% of people are dumber than that idiot who you hate with all your being. He is probably your boss, that project manager who wants you to play hopscotch with strangers as a team building exercise. Imagine if people just like him and even more stupid could singlehandedly decide on who the next ruler of your country will be. Terrible.So clearly the next best option is to have the decision be made by a more intelligent group of people. Who better than Rabbis? They have studied all their lives, are genetically more likely to have a very high IQ, have shown immense dedication, work ethic, and pure intentions (aside from pricking the penis of male converts, not sure why they do that). It&#x27;s common for them to engage in debates and intellectual discussions with each other, and they are chosen by G-d as His favored people to lead the way forward for humanity.Imagine a society where they are able to choose amongst themselves. Personally I think it would be amazing. The person they choose doesn&#x27;t even have to be a Rabbi or Jewish at all, it could be some random kid. 
But we need to all trust in their judgment because it is the best one available to us. To keep things fresh it&#x27;s probably best to rotate different Rabbis every year, maybe have one year be Conservative, the next one be Reform, etc. just for the variety and to give them a break. Many of them are senior citizens, we don&#x27;t want them getting exhausted or accelerating neurological issues they might have.</p>
</body>
`)
	r.NoError(err)
	// Assert on the parsed content, not on the require helper itself.
	r.NotNil(c)

	ps := c.Split()
	r.Len(ps, 15)
	a.Equal("A very short paragraph.", ps[0])
	a.Equal("A very long single paragraph parato Solis gemitus nefandam munus cupidisque luminis Fuge fuit vestra undis laudando in aristas Lorem markdownum trepidantum genetricis [late](http://hic.com/nunc), steterat delendaque summique non? Domum si cursus supremaque aeraque [manibus](http://www.hoc-quod.io/)", ps[1])
	a.Equal("Iovis illi fert Bacchum, vulneraque __Oleniae suis__ increpat. Sanguine raucis albet Martius infantemque est _parili multam_ auditaque Caencu inferior augent, vix dote telae volat nec horto sceleratus. Mirator ambage! _Tale_ hic Diomedis, arva sonum factas maxima et relicto Longa incustoditam fixus", ps[2])
	a.Equal("Praestantissima operata ardere semine per formae quod: in accipe quamvis amoribus, aquis medio confido puer stridore et clamavit? Video invidiosa Glauci flaventibus funeribus _tenet viscera boumque_ tacuit mearum unda interea calorem poscunt primum. Materia et Ethemon attonitae pactaeque in auctor, furiosior miscuerat", ps[3])
	a.Equal("Anguem Arctonque receptus ait vacuus vestigia vapore praetulit, moves? var compression = array + pretestVrmlLion.mediaNetbios(file(topologyFlash, 4), 625909); fat_encoding(4, 1); honeypot += cisc; tigerOutboxFile -= repository_memory_symbolic(skin_dlc(5, sampling_excel))", ps[4])
	a.Equal("var floppySmtp = core(5, fiber_opengl_touchscreen, gigabitProxy + rdf_cycle_scrolling); In orare ut finitimi Cum essemus spisso inferias: quam post: __unus cum tendebat__, ad, ecquem. Putaret vulnera noxque de vadorum materiam nomen, nives inroravere equidem Hippason hic convivia: per. var reimage_pup_case = saas + 2; cache = delExecutableMotion", ps[5])
	a.Equal("subdirectory_service_status(-3, nosqlBin, analog_thick_rte) - adf; var ddlKerning = sector / meme + 3; Inania classes te lavere; _decimo iter prominet_ Scirone, contra harena! Neu multaque vocatur raucum dux pia, fruges illam Cupidinis huius, corruit aurea crudelius structis. Velamina manebit, unda desint saetas, deteriora domumque: e haec Ceres", ps[6])
	a.Equal("Sic duro qua quidque victi Ino non, valentes fuit. Ventisque habeto [aliter feto vinci](http://partibus.org/)! Tonitrumque fidas quaerensque proles temptatum citharae iuguli duae patrio coniugis est mea genus dominae, ut nisi [ignoro](http://letiferos.net/auratis-humili.php)? Aetatis lactantia; ad et per clivo cognovit pretium", ps[7])
	a.Equal("Adorat Solem illa flumine nobis patriam auxiliare illa Theseus dubioque lunae nactus discedere obiecit, e. Where do I begin? Easy. There are lots of problems with current forms of government. Clearly Democracy is not working (see [https://www.youtube.com/watch?v=QFgcqB8-AxE] for a pretty short summary of why)", ps[8])
	a.Equal("Based on the IQ distribution, we have that 50% of people are below 100 IQ. If you go to a university or other decently high skill job, think about the dumbest person at that job. They probably have a 110+ IQ. Now consider that over 50% of people are dumber than that idiot who you hate with all your being", ps[9])
	a.Equal("He is probably your boss, that project manager who wants you to play hopscotch with strangers as a team building exercise. Imagine if people just like him and even more stupid could singlehandedly decide on who the next ruler of your country will be. Terrible", ps[10])
	a.Equal("So clearly the next best option is to have the decision be made by a more intelligent group of people. Who better than Rabbis? They have studied all their lives, are genetically more likely to have a very high IQ, have shown immense dedication, work ethic, and pure intentions (aside from pricking the penis of male converts, not sure why they do", ps[11])
	a.Equal("that). It's common for them to engage in debates and intellectual discussions with each other, and they are chosen by G-d as His favored people to lead the way forward for humanity.Imagine a society where they are able to choose amongst themselves. Personally I think it would be amazing", ps[12])
	a.Equal("The person they choose doesn't even have to be a Rabbi or Jewish at all, it could be some random kid. But we need to all trust in their judgment because it is the best one available to us. To keep things fresh it's probably best to rotate different Rabbis every year, maybe have one year be Conservative, the next one be Reform, etc", ps[13])
	a.Equal("just for the variety and to give them a break. Many of them are senior citizens, we don't want them getting exhausted or accelerating neurological issues they might have", ps[14])
}
4 changes: 4 additions & 0 deletions app/resources/datagraph/ref.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ func (r *Ref) GetKind() Kind {

type RefList []*Ref

// Len reports the number of refs in the list.
func (a RefList) Len() int {
	return len(a)
}
// Swap exchanges the refs at positions i and j in place.
func (a RefList) Swap(i, j int) {
	held := a[i]
	a[i] = a[j]
	a[j] = held
}
// Less reports whether the ref at i has a strictly greater Relevance than the
// ref at j. Ordering a list with this comparison (for example through the
// sort package) therefore places the most relevant refs first.
func (a RefList) Less(i, j int) bool {
	return a[j].Relevance < a[i].Relevance
}

func NewRef(i Item) *Ref {
return &Ref{
ID: i.GetID(),
Expand Down
26 changes: 0 additions & 26 deletions app/services/collection/collection_read/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,38 +9,31 @@ import (
"github.com/Southclaws/dt"
"github.com/Southclaws/fault"
"github.com/Southclaws/fault/fctx"
"github.com/Southclaws/opt"
"github.com/rs/xid"

"github.com/Southclaws/storyden/app/resources/account"
"github.com/Southclaws/storyden/app/resources/collection"
"github.com/Southclaws/storyden/app/resources/collection/collection_querier"
"github.com/Southclaws/storyden/app/resources/library"
"github.com/Southclaws/storyden/app/resources/post"
"github.com/Southclaws/storyden/app/resources/profile"
"github.com/Southclaws/storyden/app/resources/rbac"
"github.com/Southclaws/storyden/app/resources/visibility"
"github.com/Southclaws/storyden/app/services/authentication/session"
"github.com/Southclaws/storyden/app/services/semdex"
)

// Hydrator bundles the dependencies used by GetCollection.
type Hydrator struct {
	// logger receives a warning when relevance scoring fails.
	logger *zap.Logger
	// NOTE(review): presumably loads the collection itself; its use is not
	// visible in this view.
	querier *collection_querier.Querier
	// semdex scores collection items against the profile of the session
	// account. It is nil-checked before use, so it may be left unset.
	semdex semdex.RelevanceScorer
	// NOTE(review): use not visible in this view; confirm against GetCollection.
	session *session.Provider
}

// New returns a Hydrator wired with the given logger, collection querier,
// relevance scorer and session provider. The arguments are stored as-is.
func New(
	logger *zap.Logger,
	querier *collection_querier.Querier,
	semdex semdex.RelevanceScorer,
	session *session.Provider,
) *Hydrator {
	h := new(Hydrator)

	h.logger = logger
	h.querier = querier
	h.semdex = semdex
	h.session = session

	return h
}
Expand Down Expand Up @@ -94,24 +87,5 @@ func (r *Hydrator) GetCollection(ctx context.Context, qk collection.QueryKey) (*
return true
})

if acc, ok := session.Get(); ok && r.semdex != nil {
pro := profile.ProfileFromAccount(&acc)
ids := dt.Map(col.Items, func(i *collection.CollectionItem) xid.ID { return i.Item.GetID() })

scores, err := r.semdex.ScoreRelevance(ctx, pro, ids...)
if err != nil {
r.logger.Warn("failed to score relevance", zap.Error(err))
}

col.Items = dt.Map(col.Items, func(i *collection.CollectionItem) *collection.CollectionItem {
score, ok := scores[i.Item.GetID()]
if ok {
i.RelevanceScore = opt.New(score)
}

return i
})
}

return col, nil
}
24 changes: 0 additions & 24 deletions app/services/library/node_read/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,28 @@ import (

"github.com/Southclaws/fault"
"github.com/Southclaws/fault/fctx"
"github.com/Southclaws/opt"
"github.com/rs/xid"
"go.uber.org/zap"

"github.com/Southclaws/storyden/app/resources/library"
"github.com/Southclaws/storyden/app/resources/library/node_querier"
"github.com/Southclaws/storyden/app/resources/profile"
"github.com/Southclaws/storyden/app/services/authentication/session"
"github.com/Southclaws/storyden/app/services/semdex"
)

// HydratedQuerier bundles the dependencies used by GetBySlug.
type HydratedQuerier struct {
	// logger receives a warning when relevance scoring fails.
	logger *zap.Logger
	// NOTE(review): use not visible in this view; confirm against GetBySlug.
	session *session.Provider
	// NOTE(review): presumably loads the node itself; its use is not visible
	// in this view.
	nodereader *node_querier.Querier
	// scorer scores the node against the profile of the session account. It
	// is nil-checked before use, so it may be left unset.
	scorer semdex.RelevanceScorer
}

// New returns a HydratedQuerier wired with the given logger, session
// provider, node querier and relevance scorer. The arguments are stored as-is.
func New(
	logger *zap.Logger,
	session *session.Provider,
	nodereader *node_querier.Querier,
	scorer semdex.RelevanceScorer,
) *HydratedQuerier {
	q := new(HydratedQuerier)

	q.logger = logger
	q.session = session
	q.nodereader = nodereader
	q.scorer = scorer

	return q
}

Expand All @@ -53,22 +46,5 @@ func (q *HydratedQuerier) GetBySlug(ctx context.Context, qk library.QueryKey) (*
return nil, fault.Wrap(err, fctx.With(ctx))
}

if acc, ok := session.Get(); ok && q.scorer != nil {
pro := profile.ProfileFromAccount(&acc)
nid := xid.ID(n.Mark.ID())

scores, err := q.scorer.ScoreRelevance(ctx, pro, nid)
if err != nil {
q.logger.Warn("failed to score relevance", zap.Error(err))
}

score, ok := scores[nid]
if ok {
n.RelevanceScore = opt.New(score)
}

// TODO: Hydrate recommendations
}

return n, nil
}
5 changes: 0 additions & 5 deletions app/services/semdex/semdex.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ type Mutator interface {
// Querier groups the query-oriented interfaces (Searcher, Recommender and
// RelevanceScorer) and adds GetMany on top of them.
type Querier interface {
	Searcher
	Recommender
	RelevanceScorer

	// GetMany returns a list of refs for the given ids.
	// NOTE(review): limit presumably caps the number of refs returned;
	// confirm against the implementation.
	GetMany(ctx context.Context, limit uint, ids ...xid.ID) (datagraph.RefList, error)
}
Expand All @@ -38,7 +37,3 @@ type Recommender interface {
Recommend(ctx context.Context, object datagraph.Item) (datagraph.ItemList, error)
RecommendRefs(ctx context.Context, object datagraph.Item) (datagraph.RefList, error)
}

// RelevanceScorer scores items against a reference object.
type RelevanceScorer interface {
	// ScoreRelevance returns a map from ID to a float64 score.
	// NOTE(review): presumably each idx entry is scored by its relevance to
	// object, and IDs that could not be scored are absent from the map;
	// confirm against the implementation.
	ScoreRelevance(ctx context.Context, object datagraph.Item, idx ...xid.ID) (map[xid.ID]float64, error)
}
1 change: 0 additions & 1 deletion app/services/semdex/semdexer/semdexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ func Build() fx.Option {
fx.As(new(semdex.Querier)),
fx.As(new(semdex.Mutator)),
fx.As(new(semdex.Recommender)),
fx.As(new(semdex.RelevanceScorer)),
fx.As(new(semdex.Searcher)),
),
)
Expand Down
15 changes: 10 additions & 5 deletions app/services/semdex/semdexer/weaviate_semdexer/delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,20 @@ import (
"github.com/Southclaws/fault"
"github.com/Southclaws/fault/fctx"
"github.com/rs/xid"
"github.com/weaviate/weaviate-go-client/v4/weaviate/filters"
)

func (w *weaviateSemdexer) Delete(ctx context.Context, id xid.ID) error {
wid := GetWeaviateID(id)
delete := w.wc.Batch().
ObjectsBatchDeleter().
WithWhere(
filters.Where().
WithPath([]string{"datagraph_id"}).
WithOperator(filters.Equal).
WithValueString(id.String()),
)

err := w.wc.Data().Deleter().
WithClassName(string(w.cn)).
WithID(wid).
Do(ctx)
_, err := delete.Do(ctx)
if err != nil {
return fault.Wrap(err, fctx.With(ctx))
}
Expand Down
Loading

0 comments on commit 6bde4e2

Please sign in to comment.