Skip to content

Commit

Permalink
Merge pull request #316 from Southclaws/restructure-ai-packages
Browse files Browse the repository at this point in the history
restructure gen/semdex features according to #314
  • Loading branch information
Southclaws authored Dec 14, 2024
2 parents a6f633a + 4256d5b commit 130e1b7
Show file tree
Hide file tree
Showing 44 changed files with 742 additions and 729 deletions.
1 change: 1 addition & 0 deletions api/openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3077,6 +3077,7 @@ components:
InstanceCapability:
type: string
enum:
- gen_ai
- semdex
- email_client
- sms_client
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
// Package refhydrate provides a Semdexer implementation which wraps an instance
// of a RefSemdexer which will provide references for read-path methods instead
// of fully hydrated Storyden objects (Post, Node, etc.) The Semdexer provided
// by this package hydrates those references by looking them up in the database.
package refhydrate
// Package hydrate provides a generic datagraph item lookup conversion.
package hydrate

import (
"context"
Expand Down
2 changes: 2 additions & 0 deletions app/resources/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
collection_items "github.com/Southclaws/storyden/app/resources/collection/collection_item"
"github.com/Southclaws/storyden/app/resources/collection/collection_querier"
"github.com/Southclaws/storyden/app/resources/collection/collection_writer"
"github.com/Southclaws/storyden/app/resources/datagraph/hydrate"
"github.com/Southclaws/storyden/app/resources/event/event_querier"
"github.com/Southclaws/storyden/app/resources/event/event_writer"
"github.com/Southclaws/storyden/app/resources/event/participation/participant_querier"
Expand Down Expand Up @@ -92,6 +93,7 @@ func Build() fx.Option {
event_writer.New,
participant_querier.New,
participant_writer.New,
hydrate.New,
),
)
}
42 changes: 42 additions & 0 deletions app/services/generative/generative.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package generative

import (
"context"

"go.uber.org/fx"

"github.com/Southclaws/storyden/app/resources/datagraph"
"github.com/Southclaws/storyden/app/resources/tag/tag_ref"
"github.com/Southclaws/storyden/internal/infrastructure/ai"
)

// Tagger suggests tags for a piece of content. Within this package it is
// implemented by generator.
type Tagger interface {
// SuggestTags takes the content to analyse together with the tag names
// that are already available, and returns the suggested tag names.
SuggestTags(ctx context.Context, content datagraph.Content, available tag_ref.Names) (tag_ref.Names, error)
}

// Summariser produces a textual summary of a datagraph item. Within this
// package it is implemented by generator.
type Summariser interface {
// Summarise returns the generated summary for object as a string.
Summarise(ctx context.Context, object datagraph.Item) (string, error)
}

// Compile-time assertions that *generator satisfies both interfaces exposed by
// this package.
var (
	_ Tagger     = (*generator)(nil)
	_ Summariser = (*generator)(nil)
)

// generator implements both Tagger and Summariser on top of an ai.Prompter.
type generator struct {
// prompter receives the rendered prompts and returns the model's answer.
prompter ai.Prompter
}

// newGenerator constructs a generator that sends its prompts to the given
// prompter.
func newGenerator(prompter ai.Prompter) *generator {
	g := new(generator)
	g.prompter = prompter
	return g
}

// Build returns the fx option that registers newGenerator as a provider and
// exposes its result as both the Tagger and the Summariser interface.
func Build() fx.Option {
	constructor := fx.Annotate(
		newGenerator,
		fx.As(new(Tagger)),
		fx.As(new(Summariser)),
	)

	return fx.Provide(constructor)
}
48 changes: 48 additions & 0 deletions app/services/generative/summary.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package generative

import (
"context"
"html/template"
"strings"

"github.com/Southclaws/fault"
"github.com/Southclaws/fault/fctx"

"github.com/Southclaws/storyden/app/resources/datagraph"
)

// SummarisePrompt is the prompt template rendered by Summarise. It reads two
// fields from the data it is executed with: .Name and .Content.
//
// NOTE(review): this template is parsed with html/template, which contextually
// HTML-escapes interpolated values unless they carry a trusted type such as
// template.HTML. The rendered text is handed to a prompter rather than served
// as a web page, so text/template may be the better fit — confirm.
var SummarisePrompt = template.Must(template.New("").Parse(`
Write a short few paragraphs that are somewhat engaging but remaining relatively neutral in tone in the style of a wikipedia introduction about "{{ .Name }}". Focus on providing unique insights and interesting details while keeping the tone conversational and approachable. Imagine this will be read by someone browsing a directory or knowledgebase.
Be aware that the input to this may include broken HTML and other artifacts from the web and due to the nature of web scraping, there may be parts that do not make sense.
- Ignore any HTML tags, malformed content, or text that does not contribute meaningfully to the main topic.
- Based on the clear and coherent sections of the input, write short but engaging paragraphs. If the input lacks meaningful context, produce a neutral placeholder.
- If the input content is too fragmented or lacks sufficient context to produce a coherent response, produce a neutral placeholder.
- Do not describe the appearance of the input (e.g., broken HTML or artifacts). Instead, infer the main idea or purpose and expand on it creatively.
- If key parts of the content are missing or ambiguous, use creativity to fill gaps while maintaining relevance to the topic.
Output Format: Provide the output as a correctly formatted HTML document, you are free to use basic HTML formatting tags for emphasis, lists and headings. However, do not include the content title as a <h1> tag at the top. Start with a paragraph block immediately.
Content:
{{ .Content }}
`))

// Summarise renders SummarisePrompt with the item's name and plain-text
// content, sends the rendered prompt to the generator's prompter and returns
// the answer it produced.
//
// SummarisePrompt is an html/template, which HTML-escapes ordinary string
// values on interpolation (for example "&" becomes "&amp;"). The rendered text
// is consumed by a prompter, not displayed as a web page, so the name and
// content are passed as template.HTML in order to reach the model verbatim
// rather than as entity-encoded text.
//
// Any error from rendering the template or from the prompter is wrapped with
// the request context and returned together with an empty string.
func (g *generator) Summarise(ctx context.Context, object datagraph.Item) (string, error) {
	// Named "prompt" rather than "template" so that the html/template package
	// is not shadowed inside this function.
	var prompt strings.Builder

	err := SummarisePrompt.Execute(&prompt, map[string]any{
		// template.HTML marks the values as already safe, which disables the
		// entity escaping html/template would otherwise apply to them.
		"Name":    template.HTML(object.GetName()),
		"Content": template.HTML(object.GetContent().Plaintext()),
	})
	if err != nil {
		return "", fault.Wrap(err, fctx.With(ctx))
	}

	result, err := g.prompter.Prompt(ctx, prompt.String())
	if err != nil {
		return "", fault.Wrap(err, fctx.With(ctx))
	}

	return result.Answer, nil
}
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
package weaviate_semdexer
package generative

import (
"context"
"html/template"
"strings"
"text/template"

"github.com/Southclaws/dt"
"github.com/Southclaws/fault"
"github.com/Southclaws/fault/fctx"
"github.com/Southclaws/storyden/app/resources/datagraph"
"github.com/Southclaws/storyden/app/resources/tag/tag_ref"
"github.com/samber/lo"
"github.com/weaviate/weaviate-go-client/v4/weaviate/graphql"
)

var SuggestTagsPrompt = template.Must(template.New("").Parse(`Analyze the provided content of \"{name}\" and generate relevant tags. Tags are either single words or multiple words separated only by a hyphen, no spaces.
var SuggestTagsPrompt = template.Must(template.New("").Parse(`Analyze the provided content and generate up to three relevant tags. Tags are either single words or multiple words separated only by a hyphen, no spaces.
It's very important that only tags that are relevant to the content are returned, any tags of low confidence MUST be omitted. Do not generate tags that are too vague or tags that are too specific and cannot easily be used in other contexts for other types of content. Generally avoid tags that are singular and not plural that too closely match phrases or words in the content.
Expand All @@ -30,7 +29,7 @@ Content:
{{ .Content }}
`))

func (s *weaviateRefIndex) SuggestTags(ctx context.Context, content datagraph.Content, available tag_ref.Names) (tag_ref.Names, error) {
func (g *generator) SuggestTags(ctx context.Context, content datagraph.Content, available tag_ref.Names) (tag_ref.Names, error) {
// cap the available tags at 50, we don't want to blow out the prompt size limit.
sliced := lo.Splice(available, 50)

Expand All @@ -43,31 +42,12 @@ func (s *weaviateRefIndex) SuggestTags(ctx context.Context, content datagraph.Co
return nil, fault.Wrap(err, fctx.With(ctx))
}

prompt := strings.ReplaceAll(template.String(), "\n", `\n`)

gs := graphql.NewGenerativeSearch().SingleResult(prompt)

r, err := mergeErrors(s.wc.GraphQL().
Get().
WithClassName(s.cn.String()).
WithLimit(1).
WithGenerativeSearch(gs).
Do(ctx))
if err != nil {
return nil, fault.Wrap(err, fctx.With(ctx))
}

wr, err := mapResponseObjects(r.Data)
if err != nil {
return nil, fault.Wrap(err, fctx.With(ctx))
}

object, err := s.getFirstResult(wr)
result, err := g.prompter.Prompt(ctx, template.String())
if err != nil {
return nil, fault.Wrap(err, fctx.With(ctx))
}

strings := strings.Split(object.Additional.Generate.SingleResult, ", ")
strings := strings.Split(result.Answer, ", ")

tags := dt.Map(strings, func(s string) tag_ref.Name {
return tag_ref.NewName(s)
Expand Down
4 changes: 2 additions & 2 deletions app/services/library/node_semdex/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ func (i *semdexer) index(ctx context.Context, id library.NodeID, summarise bool,
return fault.Wrap(err, fctx.With(ctx))
}

err = i.indexer.Index(ctx, node)
err = i.semdexMutator.Index(ctx, node)
if err != nil {
return fault.Wrap(err, fctx.With(ctx))
}
Expand Down Expand Up @@ -81,7 +81,7 @@ func (i *semdexer) getSummary(ctx context.Context, p datagraph.Item) (*datagraph
func (i *semdexer) deindex(ctx context.Context, id library.NodeID) error {
qk := library.NewID(xid.ID(id))

err := i.deleter.Delete(ctx, xid.ID(id))
err := i.semdexMutator.Delete(ctx, xid.ID(id))
if err != nil {
return fault.Wrap(err, fctx.With(ctx))
}
Expand Down
54 changes: 26 additions & 28 deletions app/services/library/node_semdex/node_semdex.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/Southclaws/storyden/app/resources/library/node_writer"
"github.com/Southclaws/storyden/app/resources/mq"
"github.com/Southclaws/storyden/app/resources/tag/tag_writer"
"github.com/Southclaws/storyden/app/services/generative"
"github.com/Southclaws/storyden/app/services/semdex"
"github.com/Southclaws/storyden/app/services/tag/autotagger"
"github.com/Southclaws/storyden/internal/config"
Expand Down Expand Up @@ -39,18 +40,17 @@ var (
)

type semdexer struct {
logger *zap.Logger
db *ent.Client
nodeQuerier *node_querier.Querier
nodeWriter *node_writer.Writer
indexQueue pubsub.Topic[mq.IndexNode]
deleteQueue pubsub.Topic[mq.DeleteNode]
indexer semdex.Indexer
deleter semdex.Deleter
retriever semdex.Retriever
summariser semdex.Summariser
tagger *autotagger.Tagger
tagWriter *tag_writer.Writer
logger *zap.Logger
db *ent.Client
nodeQuerier *node_querier.Querier
nodeWriter *node_writer.Writer
indexQueue pubsub.Topic[mq.IndexNode]
deleteQueue pubsub.Topic[mq.DeleteNode]
semdexMutator semdex.Mutator
semdexQuerier semdex.Querier
summariser generative.Summariser
tagger *autotagger.Tagger
tagWriter *tag_writer.Writer
}

func newSemdexer(
Expand All @@ -64,10 +64,9 @@ func newSemdexer(
nodeWriter *node_writer.Writer,
indexQueue pubsub.Topic[mq.IndexNode],
deleteQueue pubsub.Topic[mq.DeleteNode],
indexer semdex.Indexer,
deleter semdex.Deleter,
retriever semdex.Retriever,
summariser semdex.Summariser,
semdexMutator semdex.Mutator,
semdexQuerier semdex.Querier,
summariser generative.Summariser,
tagger *autotagger.Tagger,
tagWriter *tag_writer.Writer,
) {
Expand All @@ -76,18 +75,17 @@ func newSemdexer(
}

re := semdexer{
logger: l,
db: db,
nodeQuerier: nodeQuerier,
nodeWriter: nodeWriter,
indexQueue: indexQueue,
deleteQueue: deleteQueue,
indexer: indexer,
deleter: deleter,
retriever: retriever,
summariser: summariser,
tagger: tagger,
tagWriter: tagWriter,
logger: l,
db: db,
nodeQuerier: nodeQuerier,
nodeWriter: nodeWriter,
indexQueue: indexQueue,
deleteQueue: deleteQueue,
semdexMutator: semdexMutator,
semdexQuerier: semdexQuerier,
summariser: summariser,
tagger: tagger,
tagWriter: tagWriter,
}

lc.Append(fx.StartHook(func(hctx context.Context) error {
Expand Down
2 changes: 1 addition & 1 deletion app/services/library/node_semdex/reindexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func (r *semdexer) reindex(ctx context.Context, reindexThreshold time.Duration,
keepIDs := dt.Map(keep, func(p *ent.Node) xid.ID { return p.ID })
discardIDs := dt.Map(discard, func(p *ent.Node) xid.ID { return p.ID })

indexed, err := r.retriever.GetMany(ctx, uint(reindexChunk), keepIDs...)
indexed, err := r.semdexQuerier.GetMany(ctx, uint(reindexChunk), keepIDs...)
if err != nil {
return fault.Wrap(err, fctx.With(ctx))
}
Expand Down
47 changes: 47 additions & 0 deletions app/services/semdex/disabled.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package semdex

import (
"context"

"github.com/rs/xid"

"github.com/Southclaws/storyden/app/resources/datagraph"
"github.com/Southclaws/storyden/app/resources/pagination"
"github.com/Southclaws/storyden/app/services/search/searcher"
)

// disabledSearchPanic is the message both search methods of Disabled panic
// with.
const disabledSearchPanic = "semdex disabled: searcher switch bug"

// Disabled is a Semdexer implementation that does no indexing work: mutations
// succeed without doing anything, lookups return nil results, and the search
// methods panic.
type Disabled struct{}

// Compile-time assertion that *Disabled satisfies Semdexer.
var _ Semdexer = (*Disabled)(nil)

// Index does nothing and reports success.
func (*Disabled) Index(ctx context.Context, object datagraph.Item) error {
	return nil
}

// Delete does nothing and reports success.
func (*Disabled) Delete(ctx context.Context, object xid.ID) error {
	return nil
}

// Search always panics; the panic message attributes a call to this method to
// a bug in the searcher switch.
func (*Disabled) Search(ctx context.Context, q string, p pagination.Parameters, opts searcher.Options) (*pagination.Result[datagraph.Item], error) {
	panic(disabledSearchPanic)
}

// SearchRefs always panics, with the same message as Search.
func (*Disabled) SearchRefs(ctx context.Context, q string, p pagination.Parameters, opts searcher.Options) (*pagination.Result[*datagraph.Ref], error) {
	panic(disabledSearchPanic)
}

// Recommend returns a nil item list and a nil error.
func (*Disabled) Recommend(ctx context.Context, object datagraph.Item) (datagraph.ItemList, error) {
	return nil, nil
}

// RecommendRefs returns a nil reference list and a nil error.
func (*Disabled) RecommendRefs(ctx context.Context, object datagraph.Item) (datagraph.RefList, error) {
	return nil, nil
}

// ScoreRelevance returns a nil score map and a nil error.
func (*Disabled) ScoreRelevance(ctx context.Context, object datagraph.Item, idx ...xid.ID) (map[xid.ID]float64, error) {
	return nil, nil
}

// GetMany returns a nil reference list and a nil error.
func (*Disabled) GetMany(ctx context.Context, limit uint, ids ...xid.ID) (datagraph.RefList, error) {
	return nil, nil
}
13 changes: 5 additions & 8 deletions app/services/semdex/index_job/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,7 @@ type indexerConsumer struct {
qthread pubsub.Topic[mq.IndexThread]
qreply pubsub.Topic[mq.IndexReply]

indexer semdex.Indexer
retriever semdex.Retriever
indexer semdex.Mutator
}

func newIndexConsumer(
Expand All @@ -48,8 +47,7 @@ func newIndexConsumer(
qreply pubsub.Topic[mq.IndexReply],
qprofile pubsub.Topic[mq.IndexProfile],

indexer semdex.Indexer,
retriever semdex.Retriever,
indexer semdex.Mutator,
) *indexerConsumer {
return &indexerConsumer{
l: l,
Expand All @@ -59,10 +57,9 @@ func newIndexConsumer(
accountQuery: accountQuery,
qnode: qnode,

qthread: qthread,
qreply: qreply,
indexer: indexer,
retriever: retriever,
qthread: qthread,
qreply: qreply,
indexer: indexer,
}
}

Expand Down
Loading

0 comments on commit 130e1b7

Please sign in to comment.