diff --git a/go.mod b/go.mod index 8a053ee..a67dc7d 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ toolchain go1.21.3 require ( github.com/bits-and-blooms/bloom/v3 v3.6.0 github.com/dgraph-io/badger/v4 v4.2.0 + github.com/google/uuid v1.5.0 github.com/gorilla/handlers v1.5.2 github.com/julienschmidt/httprouter v1.3.0 github.com/nlnwa/gowarc v1.1.2 @@ -39,7 +40,6 @@ require ( github.com/golang/snappy v0.0.4 // indirect github.com/google/btree v1.1.2 // indirect github.com/google/flatbuffers v23.5.26+incompatible // indirect - github.com/google/uuid v1.5.0 // indirect github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect diff --git a/index/api.go b/index/api.go index 84e7cf5..7a443fb 100644 --- a/index/api.go +++ b/index/api.go @@ -106,3 +106,9 @@ type IdResponse interface { GetValue() string GetError() error } + +type ReportAPI interface { + GenerateReport(context.Context, Request) (*schema.Report, error) + ListReport(context.Context) ([]*schema.Report, error) + GetReport(context.Context, string) (*schema.Report, error) +} diff --git a/internal/tikvidx/api.go b/internal/tikvidx/api.go index e41d0a6..a93ca05 100644 --- a/internal/tikvidx/api.go +++ b/internal/tikvidx/api.go @@ -20,11 +20,14 @@ import ( "context" "fmt" "strings" + "time" + "github.com/google/uuid" "github.com/nlnwa/gowarcserver/index" "github.com/nlnwa/gowarcserver/internal/keyvalue" "github.com/nlnwa/gowarcserver/schema" "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/timestamppb" ) // iterator mimics tikv's internal iterator interface @@ -256,3 +259,112 @@ func (db *DB) Delete(ctx context.Context) error { return firstErr } + +func (db *DB) GenerateReport(ctx context.Context, request index.Request) (*schema.Report, error) { + taskId, err := uuid.NewV7() + if err != nil { + return nil, err + } + + startTime := time.Now() + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + report := &schema.Report{ + Id: taskId.String(), + Meta: &schema.ReportMetadata{ + StartTime: timestamppb.New(startTime), + }, + } + key := keyvalue.KeyWithPrefix(taskId.String(), reportPrefix) + value, err := proto.Marshal(report) + if err != nil { + return nil, err + } + + err = db.client.Put(ctx, key, value) + if err != nil { + return nil, err + } + + // // Go routine that actually starts generating report + // go func() { + // ctx, cancel := context.WithCancel(context.Background()) + // defer cancel() + // db.tasks[taskId.String()] = cancel + + // func() { + // report.Meta.EndTime = timestamppb.New(time.Now()) + + // key := keyvalue.KeyWithPrefix(taskId.String(), reportPrefix) + + // }() + + // }() + + return report, nil +} + +// func (db *DB) Report(ctx context.Context, req index.Request, res chan<- index.ReportResponse) error { +// var it iterator +// var err error + +// key := keyvalue.SearchKeyWithPrefix(req, cdxPrefix) +// it, err = newIter(ctx, key, db.client, req) +// if err != nil { +// return err +// } +// if it == nil { +// close(res) +// return nil +// } + +// go func() { +// defer close(res) +// defer it.Close() + +// count := 0 + +// for it.Valid() { +// reportResponse := func() (reportResponse index.ReportResponse) { +// key := keyvalue.CdxKey(it.Key()) +// if !req.DateRange().Contains(key.Unix()) { +// return +// } +// cdx := new(schema.Cdx) +// if err := proto.Unmarshal(it.Value(), cdx); err != nil { +// reportResponse.Error = err +// } else if req.Filter().Eval(cdx) { +// reportResponse.Key = string(key) +// reportResponse.Value = cdx +// } +// return +// }() +// if reportResponse == (index.ReportResponse{}) { +// if err = it.Next(); err != nil { +// res <- index.ReportResponse{Error: err} +// break +// } +// continue +// } +// select { +// case <-ctx.Done(): +// res <- index.ReportResponse{Error: ctx.Err()} +// return +// case res <- reportResponse: +// if reportResponse.Error == nil { +// count++ +// } +// } +// if req.Limit() > 0 && count >= req.Limit() { +// break +// } +// if err = it.Next(); err != nil { +// res <- index.ReportResponse{Error: err} +// break +// } +// } +// }() +// return nil +// } diff --git a/internal/tikvidx/db.go b/internal/tikvidx/db.go index da9084a..712509f 100644 --- a/internal/tikvidx/db.go +++ b/internal/tikvidx/db.go @@ -34,9 +34,10 @@ import ( ) var ( - idPrefix = "i" - filePrefix = "f" - cdxPrefix = "c" + idPrefix = "i" + filePrefix = "f" + cdxPrefix = "c" + reportPrefix = "r" ) const delimiter = "_" @@ -46,6 +47,7 @@ type DB struct { batch chan index.Record done chan struct{} wg sync.WaitGroup + tasks map[string]context.CancelFunc } func NewDB(options ...Option) (db *DB, err error) { @@ -61,6 +63,7 @@ func NewDB(options ...Option) (db *DB, err error) { idPrefix = dbName + delimiter + idPrefix + delimiter filePrefix = dbName + delimiter + filePrefix + delimiter cdxPrefix = dbName + delimiter + cdxPrefix + delimiter + reportPrefix = dbName + delimiter + reportPrefix + delimiter ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() @@ -75,6 +78,7 @@ func NewDB(options ...Option) (db *DB, err error) { db = &DB{ client: client, done: done, + tasks: make(map[string]context.CancelFunc), } if opts.ReadOnly { diff --git a/schema/report.pb.go b/schema/report.pb.go new file mode 100644 index 0000000..07d7d50 --- /dev/null +++ b/schema/report.pb.go @@ -0,0 +1,500 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.32.0 +// protoc v4.25.2 +// source: report.proto + +package schema + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + durationpb "google.golang.org/protobuf/types/known/durationpb" + timestamppb "google.golang.org/protobuf/types/known/timestamppb" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type Report struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` + Query *ReportQuery `protobuf:"bytes,2,opt,name=query,proto3" json:"query,omitempty"` + Meta *ReportMetadata `protobuf:"bytes,3,opt,name=meta,proto3" json:"meta,omitempty"` + Data *ReportData `protobuf:"bytes,4,opt,name=data,proto3" json:"data,omitempty"` +} + +func (x *Report) Reset() { + *x = Report{} + if protoimpl.UnsafeEnabled { + mi := &file_report_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *Report) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Report) ProtoMessage() {} + +func (x *Report) ProtoReflect() protoreflect.Message { + mi := &file_report_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Report.ProtoReflect.Descriptor instead. +func (*Report) Descriptor() ([]byte, []int) { + return file_report_proto_rawDescGZIP(), []int{0} +} + +func (x *Report) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +func (x *Report) GetQuery() *ReportQuery { + if x != nil { + return x.Query + } + return nil +} + +func (x *Report) GetMeta() *ReportMetadata { + if x != nil { + return x.Meta + } + return nil +} + +func (x *Report) GetData() *ReportData { + if x != nil { + return x.Data + } + return nil +} + +type ReportQuery struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Domain string `protobuf:"bytes,1,opt,name=domain,proto3" json:"domain,omitempty"` +} + +func (x *ReportQuery) Reset() { + *x = ReportQuery{} + if protoimpl.UnsafeEnabled { + mi := &file_report_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ReportQuery) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ReportQuery) ProtoMessage() {} + +func (x *ReportQuery) ProtoReflect() protoreflect.Message { + mi := &file_report_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ReportQuery.ProtoReflect.Descriptor instead. +func (*ReportQuery) Descriptor() ([]byte, []int) { + return file_report_proto_rawDescGZIP(), []int{1} +} + +func (x *ReportQuery) GetDomain() string { + if x != nil { + return x.Domain + } + return "" +} + +type ReportData struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // Number of targets + NumberOfTargets int32 `protobuf:"varint,1,opt,name=number_of_targets,json=numberOfTargets,proto3" json:"number_of_targets,omitempty"` + // Number of target captures + NumberOfTargetCaptures int64 `protobuf:"varint,2,opt,name=number_of_target_captures,json=numberOfTargetCaptures,proto3" json:"number_of_target_captures,omitempty"` + // Number of URLs + NumberOfUrls int64 `protobuf:"varint,3,opt,name=number_of_urls,json=numberOfUrls,proto3" json:"number_of_urls,omitempty"` + // Distribution of HTTP status codes + HttpStatusCodeCount map[int32]int32 `protobuf:"bytes,4,rep,name=http_status_code_count,json=httpStatusCodeCount,proto3" json:"http_status_code_count,omitempty" protobuf_key:"varint,1,opt,name=key,proto3" protobuf_val:"varint,2,opt,name=value,proto3"` + // Number of domains or hosts + NumberOfDomains int32 `protobuf:"varint,5,opt,name=number_of_domains,json=numberOfDomains,proto3" json:"number_of_domains,omitempty"` + // Size in bytes + BytesTotal int64 `protobuf:"varint,6,opt,name=bytes_total,json=bytesTotal,proto3" json:"bytes_total,omitempty"` + // Number of WARC files + NrFiles int32 `protobuf:"varint,7,opt,name=nr_files,json=nrFiles,proto3" json:"nr_files,omitempty"` +} + +func (x *ReportData) Reset() { + *x = ReportData{} + if protoimpl.UnsafeEnabled { + mi := &file_report_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ReportData) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ReportData) ProtoMessage() {} + +func (x *ReportData) ProtoReflect() protoreflect.Message { + mi := &file_report_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ReportData.ProtoReflect.Descriptor instead. +func (*ReportData) Descriptor() ([]byte, []int) { + return file_report_proto_rawDescGZIP(), []int{2} +} + +func (x *ReportData) GetNumberOfTargets() int32 { + if x != nil { + return x.NumberOfTargets + } + return 0 +} + +func (x *ReportData) GetNumberOfTargetCaptures() int64 { + if x != nil { + return x.NumberOfTargetCaptures + } + return 0 +} + +func (x *ReportData) GetNumberOfUrls() int64 { + if x != nil { + return x.NumberOfUrls + } + return 0 +} + +func (x *ReportData) GetHttpStatusCodeCount() map[int32]int32 { + if x != nil { + return x.HttpStatusCodeCount + } + return nil +} + +func (x *ReportData) GetNumberOfDomains() int32 { + if x != nil { + return x.NumberOfDomains + } + return 0 +} + +func (x *ReportData) GetBytesTotal() int64 { + if x != nil { + return x.BytesTotal + } + return 0 +} + +func (x *ReportData) GetNrFiles() int32 { + if x != nil { + return x.NrFiles + } + return 0 +} + +type ReportMetadata struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + // When the report started processing + StartTime *timestamppb.Timestamp `protobuf:"bytes,1,opt,name=start_time,json=startTime,proto3" json:"start_time,omitempty"` + // How long the report took to process + Duration *durationpb.Duration `protobuf:"bytes,2,opt,name=duration,proto3" json:"duration,omitempty"` + // When the report was finished processing or when error occured + EndTime *timestamppb.Timestamp `protobuf:"bytes,3,opt,name=end_time,json=endTime,proto3" json:"end_time,omitempty"` + // Error message if any + Error string `protobuf:"bytes,4,opt,name=error,proto3" json:"error,omitempty"` +} + +func (x *ReportMetadata) Reset() { + *x = ReportMetadata{} + if protoimpl.UnsafeEnabled { + mi := &file_report_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ReportMetadata) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ReportMetadata) ProtoMessage() {} + +func (x *ReportMetadata) ProtoReflect() protoreflect.Message { + mi := &file_report_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ReportMetadata.ProtoReflect.Descriptor instead. +func (*ReportMetadata) Descriptor() ([]byte, []int) { + return file_report_proto_rawDescGZIP(), []int{3} +} + +func (x *ReportMetadata) GetStartTime() *timestamppb.Timestamp { + if x != nil { + return x.StartTime + } + return nil +} + +func (x *ReportMetadata) GetDuration() *durationpb.Duration { + if x != nil { + return x.Duration + } + return nil +} + +func (x *ReportMetadata) GetEndTime() *timestamppb.Timestamp { + if x != nil { + return x.EndTime + } + return nil +} + +func (x *ReportMetadata) GetError() string { + if x != nil { + return x.Error + } + return "" +} + +var File_report_proto protoreflect.FileDescriptor + +var file_report_proto_rawDesc = []byte{ + 0x0a, 0x0c, 0x72, 0x65, 0x70, 0x6f, 0x72, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x13, + 0x67, 0x6f, 0x77, 0x61, 0x72, 0x63, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x73, 0x63, 0x68, + 0x65, 0x6d, 0x61, 0x1a, 0x1f, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, + 0x6f, 0x62, 0x75, 0x66, 0x2f, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x1a, 0x1e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, + 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2f, 0x64, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xbe, 0x01, 0x0a, 0x06, 0x52, 0x65, 0x70, 0x6f, 0x72, 0x74, 0x12, + 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x12, + 0x36, 0x0a, 0x05, 0x71, 0x75, 0x65, 0x72, 0x79, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x20, + 0x2e, 0x67, 0x6f, 0x77, 0x61, 0x72, 0x63, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x73, 0x63, + 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x52, 0x65, 0x70, 0x6f, 0x72, 0x74, 0x51, 0x75, 0x65, 0x72, 0x79, + 0x52, 0x05, 0x71, 0x75, 0x65, 0x72, 0x79, 0x12, 0x37, 0x0a, 0x04, 0x6d, 0x65, 0x74, 0x61, 0x18, + 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x23, 0x2e, 0x67, 0x6f, 0x77, 0x61, 0x72, 0x63, 0x73, 0x65, + 0x72, 0x76, 0x65, 0x72, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x52, 0x65, 0x70, 0x6f, + 0x72, 0x74, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x52, 0x04, 0x6d, 0x65, 0x74, 0x61, + 0x12, 0x33, 0x0a, 0x04, 0x64, 0x61, 0x74, 0x61, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1f, + 0x2e, 0x67, 0x6f, 0x77, 0x61, 0x72, 0x63, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x73, 0x63, + 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x52, 0x65, 0x70, 0x6f, 0x72, 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, + 0x04, 0x64, 0x61, 0x74, 0x61, 0x22, 0x25, 0x0a, 0x0b, 0x52, 0x65, 0x70, 0x6f, 0x72, 0x74, 0x51, + 0x75, 0x65, 0x72, 0x79, 0x12, 0x16, 0x0a, 0x06, 0x64, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x64, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x22, 0xb8, 0x03, 0x0a, + 0x0a, 0x52, 0x65, 0x70, 0x6f, 0x72, 0x74, 0x44, 0x61, 0x74, 0x61, 0x12, 0x2a, 0x0a, 0x11, 0x6e, + 0x75, 0x6d, 0x62, 0x65, 0x72, 0x5f, 0x6f, 0x66, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, + 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x4f, 0x66, + 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x73, 0x12, 0x39, 0x0a, 0x19, 0x6e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x5f, 0x6f, 0x66, 0x5f, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x5f, 0x63, 0x61, 0x70, 0x74, + 0x75, 0x72, 0x65, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x03, 0x52, 0x16, 0x6e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x4f, 0x66, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x43, 0x61, 0x70, 0x74, 0x75, 0x72, + 0x65, 0x73, 0x12, 0x24, 0x0a, 0x0e, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x5f, 0x6f, 0x66, 0x5f, + 0x75, 0x72, 0x6c, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x03, 0x52, 0x0c, 0x6e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x4f, 0x66, 0x55, 0x72, 0x6c, 0x73, 0x12, 0x6d, 0x0a, 0x16, 0x68, 0x74, 0x74, 0x70, + 0x5f, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x5f, 0x63, 0x6f, 0x64, 0x65, 0x5f, 0x63, 0x6f, 0x75, + 0x6e, 0x74, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x38, 0x2e, 0x67, 0x6f, 0x77, 0x61, 0x72, + 0x63, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x52, + 0x65, 0x70, 0x6f, 0x72, 0x74, 0x44, 0x61, 0x74, 0x61, 0x2e, 0x48, 0x74, 0x74, 0x70, 0x53, 0x74, + 0x61, 0x74, 0x75, 0x73, 0x43, 0x6f, 0x64, 0x65, 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x45, 0x6e, 0x74, + 0x72, 0x79, 0x52, 0x13, 0x68, 0x74, 0x74, 0x70, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x43, 0x6f, + 0x64, 0x65, 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x12, 0x2a, 0x0a, 0x11, 0x6e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x5f, 0x6f, 0x66, 0x5f, 0x64, 0x6f, 0x6d, 0x61, 0x69, 0x6e, 0x73, 0x18, 0x05, 0x20, 0x01, + 0x28, 0x05, 0x52, 0x0f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x4f, 0x66, 0x44, 0x6f, 0x6d, 0x61, + 0x69, 0x6e, 0x73, 0x12, 0x1f, 0x0a, 0x0b, 0x62, 0x79, 0x74, 0x65, 0x73, 0x5f, 0x74, 0x6f, 0x74, + 0x61, 0x6c, 0x18, 0x06, 0x20, 0x01, 0x28, 0x03, 0x52, 0x0a, 0x62, 0x79, 0x74, 0x65, 0x73, 0x54, + 0x6f, 0x74, 0x61, 0x6c, 0x12, 0x19, 0x0a, 0x08, 0x6e, 0x72, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x73, + 0x18, 0x07, 0x20, 0x01, 0x28, 0x05, 0x52, 0x07, 0x6e, 0x72, 0x46, 0x69, 0x6c, 0x65, 0x73, 0x1a, + 0x46, 0x0a, 0x18, 0x48, 0x74, 0x74, 0x70, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x43, 0x6f, 0x64, + 0x65, 0x43, 0x6f, 0x75, 0x6e, 0x74, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, + 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, + 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x05, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0xcf, 0x01, 0x0a, 0x0e, 0x52, 0x65, 0x70, 0x6f, + 0x72, 0x74, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x39, 0x0a, 0x0a, 0x73, 0x74, + 0x61, 0x72, 0x74, 0x5f, 0x74, 0x69, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, + 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, + 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x09, 0x73, 0x74, 0x61, 0x72, + 0x74, 0x54, 0x69, 0x6d, 0x65, 0x12, 0x35, 0x0a, 0x08, 0x64, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, + 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x44, 0x75, 0x72, 0x61, 0x74, 0x69, + 0x6f, 0x6e, 0x52, 0x08, 0x64, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x35, 0x0a, 0x08, + 0x65, 0x6e, 0x64, 0x5f, 0x74, 0x69, 0x6d, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, + 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, + 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x07, 0x65, 0x6e, 0x64, 0x54, + 0x69, 0x6d, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x04, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x42, 0x26, 0x5a, 0x24, 0x67, 0x69, 0x74, + 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x6e, 0x6c, 0x6e, 0x77, 0x61, 0x2f, 0x67, 0x6f, + 0x77, 0x61, 0x72, 0x63, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x6d, + 0x61, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_report_proto_rawDescOnce sync.Once + file_report_proto_rawDescData = file_report_proto_rawDesc +) + +func file_report_proto_rawDescGZIP() []byte { + file_report_proto_rawDescOnce.Do(func() { + file_report_proto_rawDescData = protoimpl.X.CompressGZIP(file_report_proto_rawDescData) + }) + return file_report_proto_rawDescData +} + +var file_report_proto_msgTypes = make([]protoimpl.MessageInfo, 5) +var file_report_proto_goTypes = []interface{}{ + (*Report)(nil), // 0: gowarcserver.schema.Report + (*ReportQuery)(nil), // 1: gowarcserver.schema.ReportQuery + (*ReportData)(nil), // 2: gowarcserver.schema.ReportData + (*ReportMetadata)(nil), // 3: gowarcserver.schema.ReportMetadata + nil, // 4: gowarcserver.schema.ReportData.HttpStatusCodeCountEntry + (*timestamppb.Timestamp)(nil), // 5: google.protobuf.Timestamp + (*durationpb.Duration)(nil), // 6: google.protobuf.Duration +} +var file_report_proto_depIdxs = []int32{ + 1, // 0: gowarcserver.schema.Report.query:type_name -> gowarcserver.schema.ReportQuery + 3, // 1: gowarcserver.schema.Report.meta:type_name -> gowarcserver.schema.ReportMetadata + 2, // 2: gowarcserver.schema.Report.data:type_name -> gowarcserver.schema.ReportData + 4, // 3: gowarcserver.schema.ReportData.http_status_code_count:type_name -> gowarcserver.schema.ReportData.HttpStatusCodeCountEntry + 5, // 4: gowarcserver.schema.ReportMetadata.start_time:type_name -> google.protobuf.Timestamp + 6, // 5: gowarcserver.schema.ReportMetadata.duration:type_name -> google.protobuf.Duration + 5, // 6: gowarcserver.schema.ReportMetadata.end_time:type_name -> google.protobuf.Timestamp + 7, // [7:7] is the sub-list for method output_type + 7, // [7:7] is the sub-list for method input_type + 7, // [7:7] is the sub-list for extension type_name + 7, // [7:7] is the sub-list for extension extendee + 0, // [0:7] is the sub-list for field type_name +} + +func init() { file_report_proto_init() } +func file_report_proto_init() { + if File_report_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_report_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*Report); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_report_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ReportQuery); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_report_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ReportData); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_report_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ReportMetadata); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_report_proto_rawDesc, + NumEnums: 0, + NumMessages: 5, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_report_proto_goTypes, + DependencyIndexes: file_report_proto_depIdxs, + MessageInfos: file_report_proto_msgTypes, + }.Build() + File_report_proto = out.File + file_report_proto_rawDesc = nil + file_report_proto_goTypes = nil + file_report_proto_depIdxs = nil +} diff --git a/schema/report.proto b/schema/report.proto new file mode 100644 index 0000000..69f8fa6 --- /dev/null +++ b/schema/report.proto @@ -0,0 +1,56 @@ +syntax = "proto3"; + +package gowarcserver.schema; + +import "google/protobuf/timestamp.proto"; +import "google/protobuf/duration.proto"; + +option go_package = "github.com/nlnwa/gowarcserver/schema"; + +message Report { + // Unique ID for the report + string id = 1; + // The query used to generate the report + ReportQuery query = 2; + // Metadata about the report + ReportMetadata meta = 3; + // The actual report data + ReportData data = 4; +} + +// The query used to generate the report +message ReportQuery { + string url = 1; + google.protobuf.Timestamp from = 2; + google.protobuf.Timestamp to = 3; +} + +// The actual report data +message ReportData { + // Number of targets + int32 number_of_targets = 1; + // Number of target captures + int64 number_of_target_captures = 2; + // Number of unique URLs + int64 number_of_urls = 3; + // Distribution of HTTP status codes + map http_status_code_count = 4; + // Number of domains or hosts + int32 number_of_domains = 5; + // Size in bytes of the payloads (compressed or uncompressed) + int64 number_of_payload_bytes = 6; + // Number of WARC files + int32 number_of_files = 7; +} + +// Metadata about the report +message ReportMetadata { + // When the report started processing + google.protobuf.Timestamp start_time = 1; + // How long the report took to process + google.protobuf.Duration duration = 2; + // When the report was finished processing or when error occured + google.protobuf.Timestamp end_time = 3; + // Error message if any + string error = 4; +} diff --git a/server/api/api.go b/server/api/api.go index 246c531..6203449 100644 --- a/server/api/api.go +++ b/server/api/api.go @@ -18,7 +18,7 @@ package api import ( "fmt" - "net/http" + "net/url" "regexp" "strconv" "strings" @@ -26,7 +26,7 @@ import ( "github.com/nlnwa/gowarcserver/index" "github.com/nlnwa/gowarcserver/surt" "github.com/nlnwa/gowarcserver/timestamp" - "github.com/nlnwa/whatwg-url/url" + whatwgUrl "github.com/nlnwa/whatwg-url/url" ) const ( @@ -67,7 +67,7 @@ var outputs = []string{OutputCdxj, OutputJson} // CoreAPI implements a subset of https://pywb.readthedocs.io/en/latest/manual/cdxserver_api.html. type CoreAPI struct { Collection string - Url *url.Url + Url *whatwgUrl.Url DateRange *DateRange MatchType string Limit int @@ -78,11 +78,11 @@ type CoreAPI struct { Fields []string } -func (capi *CoreAPI) Uri() *url.Url { +func (capi *CoreAPI) Uri() *whatwgUrl.Url { return capi.Url } -func ClosestAPI(closest string, u *url.Url) SearchRequest { +func ClosestAPI(closest string, u *whatwgUrl.Url) SearchRequest { return SearchRequest{ CoreAPI: &CoreAPI{ Url: u, @@ -190,10 +190,8 @@ func contains(s []string, e string) bool { var schemeRegExp = regexp.MustCompile(`^[a-z][a-z0-9+\-.]+(:.*)`) // Parse parses the request r into a *CoreAPI. -func Parse(r *http.Request) (*CoreAPI, error) { +func Parse(query url.Values) (*CoreAPI, error) { var err error - query := r.URL.Query() - coreApi := new(CoreAPI) // currently the "cdx" does not accept collection as a query or param @@ -212,7 +210,7 @@ func Parse(r *http.Request) (*CoreAPI, error) { if !schemeRegExp.MatchString(urlStr) { urlStr = "http://" + urlStr } - u, err := url.Parse(urlStr) + u, err := whatwgUrl.Parse(urlStr) if err != nil { return nil, err } diff --git a/server/api/api_test.go b/server/api/api_test.go index fa030f2..b4d957e 100644 --- a/server/api/api_test.go +++ b/server/api/api_test.go @@ -2,7 +2,6 @@ package api import ( "errors" - "net/http" "net/url" "reflect" "testing" @@ -119,11 +118,8 @@ func TestParse(t *testing.T) { if test.query.closest != "" { query.Set("closest", test.query.closest) } - reqUrl.RawQuery = query.Encode() - testRequest := &http.Request{URL: reqUrl} - - got, err := Parse(testRequest) + got, err := Parse(query) if err != nil { if test.err == nil { t.Errorf("unexpected error: %s", err) diff --git a/server/coreserver/handler.go b/server/coreserver/handler.go index 4b67a3d..43a8533 100644 --- a/server/coreserver/handler.go +++ b/server/coreserver/handler.go @@ -42,12 +42,13 @@ type Handler struct { CdxAPI index.CdxAPI FileAPI index.FileAPI IdAPI index.IdAPI + ReportAPI index.ReportAPI StorageRefResolver loader.StorageRefResolver WarcLoader loader.WarcLoader } func (h Handler) search(w http.ResponseWriter, r *http.Request) { - coreAPI, err := api.Parse(r) + coreAPI, err := api.Parse(r.URL.Query()) if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) return @@ -97,7 +98,7 @@ type storageRef struct { } func (h Handler) listIds(w http.ResponseWriter, r *http.Request) { - coreAPI, err := api.Parse(r) + coreAPI, err := api.Parse(r.URL.Query()) if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) return @@ -166,7 +167,7 @@ func (h Handler) getStorageRefByURN(w http.ResponseWriter, r *http.Request) { } func (h Handler) listFiles(w http.ResponseWriter, r *http.Request) { - coreAPI, err := api.Parse(r) + coreAPI, err := api.Parse(r.URL.Query()) if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) return @@ -273,3 +274,34 @@ func parseStorageRef(ref string) (filename string, offset int64, err error) { } return } + +func (h Handler) generateReport(w http.ResponseWriter, r *http.Request) { + if err := r.ParseForm(); err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + coreAPI, err := api.Parse(r.Form) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + + ctx, cancel := context.WithCancel(r.Context()) + defer cancel() + + res, err := h.ReportAPI.GenerateReport(ctx, api.Request(coreAPI)) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + log.Error().Err(err).Msg("Failed to generate report") + return + } + + w.Header().Set("Content-Type", "application/json") + w.Header().Set("X-Content-Type-Options", "nosniff") + // w.Header().Set("Location", /report/) + w.WriteHeader(http.StatusAccepted) + err = json.NewEncoder(w).Encode(res) + if err != nil { + log.Err(err).Msg("Failed to encode generate report response") + } +} diff --git a/server/coreserver/routes.go b/server/coreserver/routes.go index 4444451..1544dd6 100644 --- a/server/coreserver/routes.go +++ b/server/coreserver/routes.go @@ -30,4 +30,16 @@ func Register(h Handler, r *httprouter.Router, mw func(http.Handler) http.Handle r.Handler("GET", pathPrefix+"/cdx", mw(http.HandlerFunc(h.search))) r.Handler("GET", pathPrefix+"/search", mw(http.HandlerFunc(h.search))) r.Handler("GET", pathPrefix+"/record/:urn", mw(http.HandlerFunc(h.loadRecordByUrn))) + + // Initiate report generating task + r.Handler("POST", pathPrefix+"/report", mw(http.HandlerFunc(h.generateReport))) + + // Cancel in progress report generating task + // r.Handler("DELETE", pathPrefix+"/report/:id", mw(http.HandlerFunc(h.cancelReport))) + + // Get report status + // r.Handler("GET", pathPrefix+"/report/:id", mw(http.HandlerFunc(h.getReport))) + + // List reports + // r.Handler("GET", pathPrefix+"/report", mw(http.HandlerFunc(h.listReports))) } diff --git a/server/warcserver/handler.go b/server/warcserver/handler.go index 471cf83..c08c03e 100644 --- a/server/warcserver/handler.go +++ b/server/warcserver/handler.go @@ -28,7 +28,7 @@ type Handler struct { } func (h Handler) index(w http.ResponseWriter, r *http.Request) { - coreAPI, err := api.Parse(r) + coreAPI, err := api.Parse(r.URL.Query()) if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) return