forked from christianmarechal/searchfirstgoodcsvline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSearchFirstGoodCSVLine.py
85 lines (66 loc) · 2.22 KB
/
SearchFirstGoodCSVLine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 17 20:08:07 2018
@author: christian marechal
Find the first useful line of a CSV file, i.e. the line where the
regular structure of a usable table begins.
"""
import os
# Switch the working directory so that the relative CSV file name used by
# this script resolves inside the project folder.
# NOTE(review): hard-coded, machine-specific path — confirm it exists on the
# machine running the script, otherwise os.chdir raises.
path = 'D:/ecomdataforgoodfr/PushMyGIT'
os.chdir(path)
import csv
def readData(fileName):
    """Read a UTF-8 text file and return its lines.

    Args:
        fileName: Path of the file to read.

    Returns:
        A list with one string per line of the file; each line keeps its
        trailing newline character when it has one.
    """
    # The context manager guarantees the handle is closed even when
    # readlines() raises (e.g. on a decoding error).
    with open(fileName, "r", encoding="utf-8") as f:
        return f.readlines()
# Load every line of the sample CSV file (name is relative to the current
# working directory); the lines are analysed further down in the script.
csvfile='test.csv'
csvlignes = readData(csvfile)
# Search for the first regular (table-like) line in a CSV file.
# csvlignes = first lines of a CSV file
# limitemax = maximum number of lines to explore
# traceon   = when true, print the analysis trace
import pandas as pd
def searFirstCSVlineGood(csvlignes, limitemax, traceon):
    """Return the index of the first line that belongs to the regular table.

    The first ``limitemax`` lines are scanned and, on each of them, the
    occurrences of the three candidate separators (comma, semicolon, tab)
    are counted.  The separator is the candidate that dominates the largest
    number of lines.  The table is then taken to be the lines sharing the
    most frequent count of that separator, and the index of the first such
    line is returned.

    Args:
        csvlignes: Iterable of text lines (the beginning of a CSV file).
        limitemax: Maximum number of lines to examine.  At least one line
            is always examined, even when this value is zero or negative.
        traceon: When true, print the per-line separator counts followed by
            the detected separator and first line.

    Returns:
        Zero-based index of the first line of the table, or 0 when
        ``csvlignes`` yields no line at all.
    """
    from collections import Counter

    separators = [',', ';', '\t']
    colonnes = ['A', 'B', 'C']  # trace column labels, one per separator

    # 1) Count every candidate separator on each of the first lines.
    comptages = []
    for n, ligne in enumerate(csvlignes, start=1):
        comptages.append([ligne.count(sep) for sep in separators])
        if n >= limitemax:
            break

    # Defaults used when there is nothing to analyse (empty input).
    sepp = 0
    premiere_ligne = 0
    occurrences = []

    if comptages:
        # 2a) Separator detection: each line votes for its most frequent
        # candidate.  list.index returns the first maximum, so a line that
        # contains no separator at all counts as a vote for the comma.
        dominants = [tab.index(max(tab)) for tab in comptages]
        votes = Counter(dominants)
        meilleur = max(votes.values())
        # When several separators are tied, the one voted for by the
        # latest line wins.
        sepp = next(d for d in reversed(dominants) if votes[d] == meilleur)

        # 2b) First line: among the counts of the chosen separator, find
        # the value shared by the most lines and keep the first line that
        # has it.
        colonne = [tab[sepp] for tab in comptages]
        frequences = Counter(colonne)
        occurrences = [frequences[valeur] for valeur in colonne]
        premiere_ligne = occurrences.index(max(occurrences))

    if traceon:
        # The table is only built for display; the analysis itself does not
        # depend on pandas.
        trace = pd.DataFrame(comptages, columns=colonnes)
        trace['nombre'] = occurrences
        print(trace)
        spar = [",", ";", "tabulation"]
        print("Separateur=[" + spar[sepp] + "], premiere ligne=" + str(premiere_ligne))
    return premiere_ligne
# Driver: analyse at most the first 20 lines with tracing enabled, then
# report the index of the first table line.
limitemax = 20
traceon = True
numero = searFirstCSVlineGood(
    csvlignes,
    limitemax=limitemax,
    traceon=traceon,
)
print(f"Premiere ligne->{numero}")