Get attachment headers asynchronously and improve test data

giuseppefutia · giuseppefutia · commit dc642924a90f · 2021-03-22T17:51:41.000+01:00
diff --git a/check.py b/check.py
@@ -1,18 +1,18 @@
+import asyncio
+import httpx
 import re
-import read
-
-import os
 import requests
-from urllib.parse import urlparse
+import os
+
+import read
 
 """
 Consistency check in the files
 """
 
 # TODO: verificare se occorrono ulteriori controlli sul testo, ad esempio strim()
-# TODO: migliora la gestione degli attachments
 
-def validity(csv, doc):
+async def validity(csv, doc):
 
     # 'oggetto' and 'pec' are required
     valid, msg = valid_csv(csv)
@@ -28,7 +28,7 @@ def validity(csv, doc):
         return msg
     
     # check and download the attachments
-    valid, msg = valid_attachments(csv)
+    valid, msg = await valid_attachments(csv)
     if not valid:
         print(msg['text'])
         return msg
@@ -80,9 +80,10 @@ def valid_consistency(csv, doc):
     return valid, msg
 
 
-def valid_attachments(csv):
+async def valid_attachments(csv):
     print('\nCheck if the attachments are valid')
     msg = {}
+    msg['text'] = ''
     valid = True
 
     # Get all the attachments
@@ -97,19 +98,45 @@ def valid_attachments(csv):
         attachments[line] = line_attachments
         line+=1
     
-    # Check and download all the attachments
-    text_error = ''
-    for k, v in attachments.items():
-        urls = [u for u in v if 'http://' in u or 'https://' in u]
-        for url in urls:
-            url_text = urlparse(url)
-            name = os.path.basename(url_text.path)
-            
-            # Won't be able to identify the name file
-            if len(url) > 0 and name == '':
+    # Check all the attachments
+    for row, urls in attachments.items():
+        
+        # No empty 'allegato' fields
+        valid_urls = [i.strip() for i in urls if i.strip() != '']
+        
+        # Get headers (async way) and process
+        results = await task(valid_urls)
+        for r in results:
+            if type(r['response']) == httpx.Headers:
+                url = r['url']
+                ct = r['response']['content-type'].lower()
+                if 'text' in ct or 'html' in ct:
+                    valid = False
+                    msg['field'] = 'xlsx'
+                    msg['text'] += """Controlla l\'allegato {u} alla riga {r}. Il link non contiene un file.\n""".format(u=url, r=int(row)+1)
+            else:
+                url = r['url']
                 valid = False
-                mgs['field'] = 'xlsx'
-                text_error += """Controlla l\'allegato {u} alla riga {k}. Non sembra un file valido.\n""".format(u=url, k=k)
-                msg['text'] = text_error
+                msg['field'] = 'xlsx'
+                msg['text'] += """Controlla l\'allegato {u} alla riga {r}. Non sembra un link valido.\n""".format(u=url, r=int(row)+1)
     
     return valid, msg
+
+
+async def request_header(client, url):
+    print('    Get attachment header: %s' % url)
+    res = None
+    try:
+        response = await client.head(url)
+        res = response.headers
+    except Exception as err:
+        res = 'Errore'
+    
+    return {'url': url, 'response': res}
+
+
+async def task(urls):
+    async with httpx.AsyncClient() as client:
+        tasks = [request_header(client, url) for url in urls]
+        results = await asyncio.gather(*tasks)
+        return results
diff --git a/create.py b/create.py
@@ -17,19 +17,16 @@ def mails(csv, templ):
         # Recipient
         m['recipient'] = row['pec']
 
-        # Mail text
+        # Body
         doc = mail_text(row, templ)
         text_doc = get_text(doc)
         m['body'] = text_doc
 
         # Attachments
         m['attachments'] = []
-
         for k,v in row.items():
             if 'allegato' in k and v != '':
-                url_text = urlparse(v)
-                name = os.path.basename(url_text.path) 
-                m['attachments'].append(name)
+                m['attachments'].append(v)
 
         all_mails.append(m)
     
diff --git a/preview.html b/preview.html
@@ -1,15 +1,21 @@
 <div class="row">
-  <div class="col-sm-1"></div>
-  <div class="col-sm-10">
-    <p class="fw-bolder fs-1 text"> {{ subject }} </p>
+  <div class="col-sm-2"></div>
+  <div class="col-sm-8">
+    <p class="fw-bolder fs-1 text"> Oggetto: {{ subject }} </p>
     <p class="fs-2 text"> Destinatario: {{ recipient }} </p>
     <div style="white-space: pre-line">
       <p class="font-monospace lh-lg fw-normal">{{ body }}</p>
     </div>
+    <p class="fs-3 text"> Allegati: </p>
+    <ul>
+      {%- for item in attachments -%}
+      <li><a href="{{ item }}">{{ item }}</a></li>
+      {%- endfor -%}
+    </ul>
     <form id="send">
       <button class="btn btn-primary" type="submit" formaction='/send'>Invia le mail</button>
       <button class="btn btn-primary" type="submit" formaction='/'>Modifica i file caricati</button>
     </form>
   </div>
-  <div class="col-sm-1"></div>
+  <div class="col-sm-2"></div>
 </div>
diff --git a/server.py b/server.py
@@ -35,13 +35,13 @@ async def load_files(request: Request,
     data.xlsx['content-type'] = xlsx_file.content_type
 
     # Process and manage the results
-    result, docx, xlsx = process_inputs(xlsx_byte, xlsx_file.content_type, docx_byte)
+    result, docx, xlsx = await process_inputs(xlsx_byte, xlsx_file.content_type, docx_byte)
 
     if result == 'OK':
         data.mails = create.mails(xlsx, docx)
         data.docx['file'] = docx
-        data.xlsx['file'] = xlsx   
-    
+        data.xlsx['file'] = xlsx
+
     return result
 
 
@@ -52,7 +52,8 @@ def prepare_preview(request: Request):
         'request': request,
         'subject': data.mails[0]['subject'],
         'recipient' : data.mails[0]['recipient'],
-        'body': data.mails[0]['body']
+        'body': data.mails[0]['body'],
+        'attachments': data.mails[0]['attachments']
     }
 
     return templates.TemplateResponse('preview.html', context=context)
@@ -70,14 +71,14 @@ async def massive_send(request: Request):
     return templates.TemplateResponse('results.html', context=context)
 
 
-def process_inputs(xlsx_byte, xlsx_content_type, docx_byte):
+async def process_inputs(xlsx_byte, xlsx_content_type, docx_byte):
     if xlsx_content_type == 'text/csv':
         csv_reader = read.read_csv(xlsx_byte)
     elif xlsx_content_type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
         csv_reader = read.read_xlsx(xlsx_byte)
     
     xlsx = read.read_csv_reader(csv_reader)
     docx = read.read_docx(docx_byte)
-    res = check.validity(xlsx, docx)
+    res = await check.validity(xlsx, docx)
 
     return res, docx, xlsx
diff --git a/test/file-sample_100kB.doc b/test/file-sample_100kB.doc
diff --git a/test/test.csv b/test/test.csv
@@ -1,2 +1,2 @@
-destinatario,oggetto,pec,nome_progetto,firma,allegato1,allegato2,allegato3
-Caruso,Oggetto della mail per Caruso,giuseppe.futia@gmail.com,Progetto di Caruso,Wikimedia Italia,https://file-examples-com.github.io/uploads/2017/02/file-sample_100kB.doc,,
+destinatario,oggetto,pec,nome_progetto,firma,allegato1,allegato2,allegato3,allegato4
+Caruso,Oggetto della mail per Caruso,giuseppe.futia@gmail.com,Progetto di Caruso,Wikimedia Italia,https://file-examples-com.github.io/uploads/2017/02/file-sample_100kB.doc,,,
diff --git a/test/test_bad_attachments.csv b/test/test_bad_attachments.csv
@@ -0,0 +1,2 @@
+destinatario,oggetto,pec,nome_progetto,firma,allegato1,allegato2,allegato3,allegato4
+Caruso,Oggetto della mail per Caruso,giuseppe.futia@gmail.com,Progetto di Caruso,Wikimedia Italia,https://file-examples-com.github.io/uploads/2017/02/file-sample_100kB.doc,https://www.googsdasdasle.it,ciao,http://www.google.it

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -0,0 +1,2 @@` |
| | `1` | `+destinatario,oggetto,pec,nome_progetto,firma,allegato1,allegato2,allegato3,allegato4` |
| | `2` | `+Caruso,Oggetto della mail per Caruso,giuseppe.futia@gmail.com,Progetto di Caruso,Wikimedia Italia,https://file-examples-com.github.io/uploads/2017/02/file-sample_100kB.doc,https://www.googsdasdasle.it,ciao,http://www.google.it` |