working on app

dkohlsdorf · Aug 13, 2023 · ec7221c · ec7221c
1 parent cf70408
commit ec7221c
Show file tree

Hide file tree

Showing 4 changed files with 92 additions and 23 deletions.
diff --git a/app.py b/app.py
@@ -11,7 +11,7 @@
 from flask import Flask, render_template, flash, redirect, request
 
 
-VERSION     = 'sep_2022' 
+VERSION     = 'no_echo' 
 SEQ_PATH    = f'../web_service/{VERSION}/sequences/'
 IMG_PATH    = f'../web_service/{VERSION}/images/'
 PKL_PATH    = f'../web_service/{VERSION}/service.pkl'
@@ -162,12 +162,35 @@ def discovery():
     return render_template('discovery.html', sequences=sequences, n=len(sequences), keys = s[2])
 
 
+@app.route('/query_relaxed', methods=['POST'])
+@flask_login.login_required
+def upload_relaxed():
+    """Handle a wav upload and answer it with a relaxed discovery query.
+
+    The uploaded file is stored under UPLOAD_PATH and handed to
+    DISCOVERY.query_by_file with its second argument set to True
+    (presumably the ``relax`` flag -- confirm against DISCOVERY's type).
+    The space-joined decoding and the stored file name are recorded in
+    QueryHistory, and the neighbors are rendered with 'discovery.html'.
+
+    Returns:
+        A redirect to '/discovery' (with a flashed message) when no file
+        was sent or the file is not a '.wav'; otherwise the rendered
+        discovery page.
+    """
+    # Local import: only this view needs it, and the rest of app.py's
+    # import block is not visible from here.
+    import os
+
+    # The route only accepts POST, so no request.method check is needed.
+    print(request.files)
+    if 'file' not in request.files:
+        flash('No File Uploaded')
+        return redirect('/discovery')
+    file = request.files['file']
+    # The file name is chosen by the client. Keep only its last path
+    # component so a name such as '../../x.wav' cannot be written outside
+    # UPLOAD_PATH; a missing name is treated as "not a wav file".
+    filename = os.path.basename(file.filename or '')
+    if not filename.endswith('.wav'):
+        flash('Only wav files are allowed')
+        return redirect('/discovery')
+    path = f"{UPLOAD_PATH}/{filename}"
+    file.save(path)
+    print("Done Upload")
+    img, decoding, nn, keys = DISCOVERY.query_by_file(path, True)
+    decoding = " ".join(decoding)
+
+    history = QueryHistory()
+    history.insert(decoding, filename)
+
+    sequences = [process_sequence(x) for x in nn]
+    return render_template('discovery.html', sequences=sequences, n=len(sequences), keys = keys, query=(img, decoding))
+
 @app.route('/query', methods=['POST'])
 @flask_login.login_required
 def upload():
     if request.method == 'POST':
         print(request.files)
-
         if 'file' not in request.files:
             flash('No File Uploaded')
             return redirect('/discovery')            
@@ -177,7 +200,8 @@ def upload():
             return redirect('/discovery')
         path = f"{UPLOAD_PATH}/{file.filename}"
         file.save(path)
-        img, decoding, nn, keys = DISCOVERY.query_by_file(path)
+        print("Done Upload")
+        img, decoding, nn, keys = DISCOVERY.query_by_file(path, False)
         decoding = " ".join(decoding)
 
         history = QueryHistory()

diff --git a/decoder_worker.py b/decoder_worker.py
@@ -27,6 +27,7 @@
 
 from fastavro import writer, reader, parse_schema
 from scipy.io.wavfile import write
+
 ADDR       = 'localhost:50051' 
 VERSION    = 'no_echo' 
 SEQ_PATH   = f'../web_service/{VERSION}/sequences/'
@@ -170,7 +171,7 @@ def subsequences(sequence, max_len=8):
         for i in range(length, n):
             substring = " ".join([s['cls'] for s in sequence[i-length:i]])
             yield substring
-
+            
 
 class DiscoveryService:
 
@@ -181,6 +182,9 @@ def __init__(self, sequence_path, img_path, limit = None):
         self.decodings     = []
         self.encounter_ids = []
 
+        # TODO ts id extern index -> sequence 
+        self.inverted_idx = {}
+
         self.densities  = {}       
         self.neighbors  = {}
         self.substrings = {}
@@ -194,6 +198,7 @@ def __init__(self, sequence_path, img_path, limit = None):
         self.parse(sequence_path, limit)
         self.setup_discovery()
         self.setup_substrings()        
+        self.setup_inverted()
         self.sequence_path = sequence_path
         self.img_path = img_path    
 
@@ -202,8 +207,9 @@ def init_model(self, model_path):
         self.lab           = pkl.load(open(f"{model_path}/labels.pkl", "rb"))
         self.reverse       = {v:k for k, v in self.lab.items()}
         self.label_mapping = pkl.load(open(f'{model_path}/label_mapping.pkl', 'rb'))
+        load(ADDR, VERSION)
 
-    def parse(self, sequence_path, limit):        
+    def parse(self, sequence_path, limit):            
         for file in os.listdir(sequence_path):
             eid = file.replace('.avro', '')
             print(f" ... reading: {file} {eid}")
@@ -214,7 +220,7 @@ def parse(self, sequence_path, limit):
                     avro_reader = reader(fo)
                     for record in avro_reader:
                         self.sequences.append(record)
-                        self.encounter_ids.append(eid)
+                        self.encounter_ids.append(eid)                        
 
     def setup_substrings(self):
         for i, sequence in enumerate(self.sequences):
@@ -223,7 +229,12 @@ def setup_substrings(self):
                 if sub not in self.substrings:
                     self.substrings[sub] = []
                 self.substrings[sub].append(i)
-
+
+    def setup_inverted(self):
+        """Populate ``self.inverted_idx`` from the loaded sequences.
+
+        Each id listed under a sequence's 'proba_ids' is mapped to that
+        sequence's position in ``self.sequences``. If an id occurs in
+        more than one sequence, the later position overwrites the earlier.
+        """
+        for position, record in enumerate(self.sequences):
+            # Updating in place keeps any entries already in the mapping.
+            self.inverted_idx.update(dict.fromkeys(record['proba_ids'], position))
+
     def setup_discovery(self):
         for key, sequence in enumerate(self.sequences):
             decoded = [DecodedSymbol.from_dict(x) for x in sequence['sequence']]            
@@ -259,8 +270,8 @@ def sample(self):
         keys = [neighbor for _, neighbor in self.neighbors[region]]
         nn   = [self.sequences[neighbor] for neighbor in keys]
         return self.sequences[region], nn, keys
-
-    def query_by_file(self, filename):
+    
+    def query_by_file(self, filename, relax=False):
         name = str(filename).split('/')[-1].split('.')[0]             
         query_id = f"query_{name}"
         audio = raw(filename)
@@ -274,26 +285,27 @@ def query_by_file(self, filename):
 
         n = len(probs)
         probas = []
-        for i in range(100, n, 50):
-            probas.append(probs[i-100:i])
-        ids = insert_all(probas, ADDR)
-
+        for i in range(100, n, 10):
+            probas.append(probs[i-100:i])    
+
         records = [{                
             "path":      name,
             "start":     start_bound,
             "stop":      stop_bound,
             "sequence":  [token.to_dict() for token in c],
-            "proba_ids": ids
+            "proba_ids": []
         }]                                               
         with open(f'{self.sequence_path}/{query_id}.avro', 'wb') as out:
             writer(out, SCHEMA, records)
 
         decoded = [DecodedSymbol.from_dict(x) for x in records[0]['sequence']]
-        neighbors = query(decoded, self.decodings, self.db)
+        if relax:
+            neighbors = find_relaxed(ADDR, VERSION, probas, self.inverted_idx)
+        else:
+            neighbors = query(decoded, self.decodings, self.db)
         keys = [neighbor for _, neighbor in neighbors]
         nn   = [self.sequences[neighbor] for neighbor in keys]        
-        return f"{query_id}.png", [s.cls for s in decoded], nn, keys
-
+        return f"{query_id}.png", [s.cls for s in decoded], nn, keys        
 
     def get(self, region):
         keys = [neighbor for _, neighbor in self.neighbors[region]]

diff --git a/lib_dolphin/extern_index.py b/lib_dolphin/extern_index.py
@@ -7,6 +7,8 @@
 import indexing_pb2
 import indexing_pb2_grpc
 
+from collections import defaultdict
+
 SIL = True
 
 def timeseries(ts):
@@ -43,9 +45,33 @@ def reindex(addr, name):
         response = stub.save(indexing_pb2.SaveIndexRequest(name = "name"))
         print("saving done")
 
-        
+
 def load(addr, name):
     with grpc.insecure_channel(addr) as channel:
-        stub = indexing_pb2_grpc.TimeSeriesServiceStub(channel)
+        stub = indexing_pb2_grpc.TimeSeriesServiceStub(channel)        
         response = stub.load(indexing_pb2.LoadIndexRequest(name = name))
 
+
+def find_relaxed(addr, name, sequences, inverted_idx, k = 10):
+    """Rank indexed sequences by how often the index returns their ids.
+
+    Every entry of ``sequences`` is converted with ``timeseries`` and sent
+    to the service at ``addr`` through ``stub.query``. Each id in the
+    response is looked up in ``inverted_idx``; ids that are present add
+    one vote to the sequence index they map to, ids that are absent are
+    only counted and printed for diagnostics.
+
+    Args:
+        addr: address passed to ``grpc.insecure_channel``.
+        name: accepted for interface compatibility; not used here.
+        sequences: iterable of query windows, one service query each.
+        inverted_idx: mapping from service id to sequence index.
+        k: maximum number of results to return.
+
+    Returns:
+        A list of at most ``k`` tuples ``(1.0 / votes, sequence_index)``,
+        ordered from most to fewest votes. Empty when ``sequences`` is
+        empty or none of the returned ids are in ``inverted_idx``.
+    """
+    with grpc.insecure_channel(addr) as channel:
+        stub = indexing_pb2_grpc.TimeSeriesServiceStub(channel)
+        not_there = 0
+        there = 0
+        found = defaultdict(int)
+        not_found = set()
+        for sequence in sequences:
+            query = timeseries(sequence)
+            response = stub.query(query)
+            for neighbor in response.ids:
+                if neighbor not in inverted_idx:
+                    not_there += 1
+                    not_found.add(neighbor)
+                else:
+                    found[inverted_idx[neighbor]] += 1
+                    there += 1
+
+    # Rank once, after all votes are in. Doing it outside the loop also
+    # keeps the result defined (an empty list) when `sequences` is empty.
+    ranked = sorted(found.items(), key=lambda item: -item[1])[:k]
+
+    print(f"{there} / {not_there}: {len(not_found)}")
+    print(not_found)
+    # More votes give a smaller first element, which callers that treat
+    # it as a distance would read as "closer" -- presumably the intent.
+    return [(1.0 / votes, seq_idx) for seq_idx, votes in ranked]
diff --git a/templates/discovery.html b/templates/discovery.html
@@ -40,11 +40,18 @@ <h3> ERRORS: <h3>
     </tr>
     <tr>         
         <td>
-            <form action='/query' method='post' enctype='multipart/form-data'>
-                <input type="file" name="file">
-                <input type='submit' value='Upload'/>
-            </form> 
+          Relaxed: <form action='/query_relaxed' method='post' enctype='multipart/form-data'>
+            <input type="file" name="file"> 
+            <input type='submit' value='Upload'/>
+          </form> 
+        </td> 
+        <td>
+          Constrained: <form action='/query' method='post' enctype='multipart/form-data'>
+            <input type="file" name="file"> 
+            <input type='submit' value='Upload'/>
+          </form> 
         </td> 
+
     </tr>
     <tr>                 
         <td>