Merge pull request #67 from boostcampaitech4lv23cv2/build
[Release] Version 1.1.2
0seob authored Feb 16, 2023
2 parents 3e263ae + 4cad21c commit ca7ccc8
Showing 13 changed files with 666 additions and 84 deletions.
4 changes: 2 additions & 2 deletions Data/utils/dataset_split.py
@@ -32,7 +32,7 @@ def dataset_split(path, name):
if __name__ == "__main__":
    random.seed(2023)
    np.random.seed(2023)
-    fish_path = '/opt/ml/data/fish/'
-    sashimi_path = '/opt/ml/data/sashimi/'
+    fish_path = '/opt/ml/data2/fish/'
+    sashimi_path = '/opt/ml/data2/sashimi/'
    dataset_split(fish_path, 'fish')
    dataset_split(sashimi_path, 'sashimi')
56 changes: 56 additions & 0 deletions Data/utils/pseudo_dataset.py
@@ -0,0 +1,56 @@
import pandas as pd
import argparse
import glob
import os
from datetime import datetime

def make_csv(data: str, path: str, date: str):
    """Append newly collected pseudo-labeled images to the dataset CSV.

    Args:
        data (str): dataset name ('fish' or 'sashimi')
        path (str): folder path containing the dataset CSV
        date (str): today's date (name of the dated image folder)
    """
    today_path = os.path.join(path, date)
    files = os.listdir(today_path)
    csv_path = os.path.join(path, data + '.csv')
    df = pd.read_csv(csv_path)

    for file in files:
        # filename stem split on '_': parts[1] is the category id,
        # parts[2] and parts[3] are the values used by the threshold filter below
        file = file[:-4].split('_')
        print(file)
        if (int(file[3]) == 2 and int(file[2]) >= 70) or (int(file[3]) == 1 and int(file[2]) >= 80):
            new_row = {'img_path': f"{date}/" + file[0] + '.jpg', 'categories_id': file[1]}
            df = df.append(new_row, ignore_index=True)  # pandas<2.0 API; use pd.concat on newer pandas

    df.to_csv(os.path.join(path, data + '.csv'), columns=['img_path', 'categories_id'], index=False)


def rename_file(path: str, date: str):
    """Strip the label suffix from each filename, keeping only the image id.

    Args:
        path (str): folder path containing the dated image folder
        date (str): today's date (name of the dated image folder)
    """
    today_path = os.path.join(path, date)
    files = os.listdir(today_path)
    for file in files:
        f_split = file[:-4].split('_')
        os.rename(os.path.join(today_path, file), os.path.join(today_path, f_split[0] + ".jpg"))


if __name__ == "__main__":
    today = datetime.today().strftime('%Y%m%d')

    parser = argparse.ArgumentParser()
    parser.add_argument('--fpath', type=str, default='/opt/ml/data2/fish', help='fish folder path')
    parser.add_argument('--spath', type=str, default='/opt/ml/data2/sashimi', help='sashimi folder path')
    parser.add_argument('--date', type=str, default=today, help='today date')

    args = parser.parse_args()

    make_csv('fish', args.fpath, args.date)
    make_csv('sashimi', args.spath, args.date)

    rename_file(args.fpath, args.date)
    rename_file(args.spath, args.date)
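
The threshold logic in `make_csv` keys off the underscore-separated filename. A small sketch of that rule in isolation; the filename layout is inferred from the indexing above, and the sample filenames are hypothetical:

```python
# Sketch of the pseudo-label filter in make_csv. The stem is assumed to be
# <image_id>_<category_id>_<score>_<flag>; the sample filenames are hypothetical.
def keep_pseudo_label(filename: str) -> bool:
    image_id, category_id, score, flag = filename[:-4].split('_')
    score, flag = int(score), int(flag)
    # mirror make_csv: flag 2 needs a score of at least 70, flag 1 at least 80
    return (flag == 2 and score >= 70) or (flag == 1 and score >= 80)


print(keep_pseudo_label("0001_3_85_1.jpg"))  # True
print(keep_pseudo_label("0002_5_60_2.jpg"))  # False (below the flag-2 threshold)
```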
8 changes: 3 additions & 5 deletions README.md
@@ -2,8 +2,7 @@

## 🐟[FICV](https://ficv-74b93.web.app/) | A fish species classification service to prevent cheating at seafood markets🐠

-https://user-images.githubusercontent.com/71117066/217698404-eb7bf826-11bc-412d-a945-e810d56f5369.mp4
+https://user-images.githubusercontent.com/29935109/219285923-0bd00817-66f7-49bd-ac72-37aba273ab09.mp4

## 🕵️Members

@@ -93,10 +92,9 @@
## 💾 Datasets
- Image composition:
  - AI HUB: 500,000 images (광어, 우럭, 참돔, 감성돔, 돌돔) -> 100,000 per species
-  - Scraping Dataset (at least 200 images per species)
+  - Scraping Dataset (at least 300 images per species)
    - Fish (광어, 우럭, 참돔, 감성돔, 돌돔, 민어, 큰민어, 강도다리, 자바리, 능성어, 방어, 부시리)
-    - Sashimi (광어, 우럭, 참돔, 민어, 점성어, 틸라피아, 연어, 참치, 방어)
-  - 11 class: Background, General trash, Paper, Paper pack, Metal, Glass, Plastic, Styrofoam, Plastic bag, Battery, Clothing
+    - Sashimi (광어, 우럭, 참돔, 민어, 점성어, 틸라피아, 연어, 참치, 방어)
- annotation format: CSV (image path and label on each line; a hypothetical sample follows below)
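
For reference, a hypothetical sample of that annotation CSV; the column names `img_path` and `categories_id` come from `Data/utils/pseudo_dataset.py` in this commit, and the rows are made up:

```python
# Hypothetical annotation CSV sample; the columns follow pseudo_dataset.py,
# the two rows are illustrative only.
import pandas as pd

sample = pd.DataFrame(
    [
        {"img_path": "20230216/0001.jpg", "categories_id": 3},
        {"img_path": "20230216/0002.jpg", "categories_id": 7},
    ]
)
sample.to_csv("fish.csv", columns=["img_path", "categories_id"], index=False)
# fish.csv now contains:
# img_path,categories_id
# 20230216/0001.jpg,3
# 20230216/0002.jpg,7
```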

## 🏔️Environments
31 changes: 27 additions & 4 deletions airflow/README.md
@@ -1,10 +1,33 @@
# Airflow

-## GCP Setup
+> The Airflow server runs on GCP and is configured to connect to another account's GCP bucket and to the aistages server so that images can be transferred between them.
+### GCP Setup
- https://medium.com/apache-airflow/a-simple-guide-to-start-using-apache-airflow-2-on-google-cloud-1811c2127445
- https://ruuci.tistory.com/6

-## Installing Airflow with Docker Compose
+### Installing Airflow with Docker Compose
- https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html
- https://mep997.tistory.com/27

-## Google Cloud Connection
+### Google Cloud Connection
- Connection Id = any name you like
- Connection Type = Google Cloud
- Keyfile Path = absolute path of the keyfile uploaded to the GCP instance
  - e.g. /opt/airflow/dags/key.json
- Caution: fill in only one of Keyfile Path, Keyfile JSON, or Keyfile Secret Name; do not set them together
- The service account that issued the key file must be granted the Storage Object Admin role, or the bucket files cannot be accessed (see the sketch right after this list).
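
A minimal Python sketch of registering the same Google Cloud connection without the UI. The conn_id here is arbitrary, the keyfile path is the example value from this README, and depending on the google provider version the extra key may need the `extra__google_cloud_platform__` prefix:

```python
# Sketch only: create the Google Cloud connection described above from Python.
import json

from airflow.models import Connection
from airflow.settings import Session

gcp_conn = Connection(
    conn_id="google_cloud_ficv",        # "any name you like" (assumed ID)
    conn_type="google_cloud_platform",  # Connection Type = Google Cloud
    extra=json.dumps({"key_path": "/opt/airflow/dags/key.json"}),  # absolute keyfile path
)

session = Session()
session.add(gcp_conn)  # assumes the conn_id does not exist yet
session.commit()
```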

-## SFTP server Connection
+### SFTP server Connection
- Connection Id = any name you like
- Connection Type = SFTP
- Host = IP address of the server to connect to
- username = username on the server
- Port = SFTP port number
- Extra = location of the SSH keyfile uploaded to the GCP instance, in JSON format (see the sketch after this list)
  - e.g. {"key_file": "/opt/airflow/dags/key"}
- Reference: https://docs.aws.amazon.com/ko_kr/mwaa/latest/userguide/samples-ssh.html
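
Similarly, a minimal sketch of the SFTP connection. Host, username, port, and key path are placeholders to replace with your server's values; `aistages` is the conn_id the DAGs in this commit expect:

```python
# Sketch only: create the SFTP connection described above from Python.
import json

from airflow.models import Connection
from airflow.settings import Session

sftp_conn = Connection(
    conn_id="aistages",            # the DAGs below use SFTP_CONN_ID = "aistages"
    conn_type="sftp",              # Connection Type = SFTP
    host="1.2.3.4",                # IP address of the server to connect to (placeholder)
    login="root",                  # username on the server (placeholder)
    port=2222,                     # SFTP port number (placeholder)
    extra=json.dumps({"key_file": "/opt/airflow/dags/key"}),  # SSH keyfile on the Airflow host
)

session = Session()
session.add(sftp_conn)
session.commit()
```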

### Writing DAGs
- gcs to sftp: https://airflow.apache.org/docs/apache-airflow-providers-google/stable/operators/transfer/gcs_to_sftp.html
- execute bash command on sftp server: https://airflow.apache.org/docs/apache-airflow-providers-ssh/stable/_api/airflow/providers/ssh/operators/ssh/index.html
- A condensed sketch combining these two operators follows below.
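
A condensed sketch of the pattern the two pages above describe: copy objects from a GCS bucket to the aistages server over SFTP, then run a shell command there. The full implementation is `airflow/data_pipeline.py` in this commit; the bucket, connection id, and paths below are taken from it, and the DAG id and listing command are illustrative:

```python
# Condensed sketch: GCS-to-SFTP copy followed by a remote shell command.
# See data_pipeline.py for the full DAG; the listing command here is illustrative.
from datetime import datetime

from airflow import models
from airflow.providers.google.cloud.transfers.gcs_to_sftp import GCSToSFTPOperator
from airflow.providers.ssh.hooks.ssh import SSHHook
from airflow.providers.ssh.operators.ssh import SSHOperator

with models.DAG(
    "gcs_to_sftp_sketch",
    schedule="@once",
    start_date=datetime(2023, 2, 5),
    catchup=False,
) as dag:
    copy_fish_images = GCSToSFTPOperator(
        task_id="copy-fish-images",
        sftp_conn_id="aistages",
        source_bucket="user-data-cv13",
        source_object="fish/*",
        destination_path="/opt/ml/data2/fish",
        keep_directory_structure=False,
    )
    list_copied_files = SSHOperator(
        task_id="list-copied-files",
        ssh_hook=SSHHook(ssh_conn_id="aistages"),
        command="ls /opt/ml/data2/fish",
    )
    copy_fish_images >> list_copied_files
```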
70 changes: 70 additions & 0 deletions airflow/bucket_image_delete.py
@@ -0,0 +1,70 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Airflow DAG that deletes the fish and sashimi image objects from the GCS bucket.
"""
from __future__ import annotations

import os
from datetime import datetime
from pathlib import Path

from airflow import models
from airflow.providers.google.cloud.transfers.gcs_to_sftp import GCSToSFTPOperator
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
from airflow.operators.dummy import DummyOperator
from airflow.providers.ssh.operators.ssh import SSHOperator
from airflow.providers.ssh.hooks.ssh import SSHHook
from airflow.utils.trigger_rule import TriggerRule

ENV_ID = os.environ.get("SYSTEM_TESTS_ENV_ID")
PROJECT_ID = os.environ.get("SYSTEM_TESTS_GCP_PROJECT")
DAG_ID = "gcs_to_sftp"

SFTP_CONN_ID = "aistages"
BUCKET_NAME = "user-data-cv13"
DESTINATION_PATH_FISH = "/opt/ml/data2/fish"
DESTINATION_PATH_SASHIMI = "/opt/ml/data2/sashimi"
GCS_FISH_DIR = "fish/*"
GCS_SASHIMI_DIR = "sashimi/*"
date = "{{ ds_nodash }}"
sshHook = SSHHook(ssh_conn_id=SFTP_CONN_ID)

with models.DAG(
    DAG_ID,
    schedule="@once",
    start_date=datetime(2023, 2, 5),
    catchup=False,
    tags=["example", "gcs"],
) as dag:

    fish_image_delete = GCSDeleteObjectsOperator(
        task_id="fish-image-delete",
        bucket_name='ficv_dataset',
        prefix='fish/'
    )

    sashimi_image_delete = GCSDeleteObjectsOperator(
        task_id="sashimi-image-delete",
        bucket_name='ficv_dataset',
        prefix='sashimi/'
    )

    (
        fish_image_delete >> sashimi_image_delete
    )
132 changes: 132 additions & 0 deletions airflow/data_pipeline.py
@@ -0,0 +1,132 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
FICV data pipeline DAG: transfers newly collected images from GCS to the aistages server over SFTP, cleans up the bucket, and rebuilds the training dataset.
"""
from __future__ import annotations

import os
from datetime import datetime
from pathlib import Path

from airflow import models
from airflow.providers.google.cloud.transfers.gcs_to_sftp import GCSToSFTPOperator
from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
from airflow.operators.dummy import DummyOperator
from airflow.providers.ssh.operators.ssh import SSHOperator
from airflow.providers.ssh.hooks.ssh import SSHHook
from airflow.utils.trigger_rule import TriggerRule

ENV_ID = os.environ.get("SYSTEM_TESTS_ENV_ID")
PROJECT_ID = os.environ.get("SYSTEM_TESTS_GCP_PROJECT")
DAG_ID = "ficv_pipeline"

SFTP_CONN_ID = "aistages"
BUCKET_NAME = "user-data-cv13"
DESTINATION_PATH_FISH = "/opt/ml/data2/fish"
DESTINATION_PATH_SASHIMI = "/opt/ml/data2/sashimi"
GCS_FISH_DIR = "fish/*"
GCS_SASHIMI_DIR = "sashimi/*"
date = "{{ ds_nodash }}"
sshHook = SSHHook(ssh_conn_id=SFTP_CONN_ID)

with models.DAG(
    DAG_ID,
    schedule="@once",
    start_date=datetime(2023, 2, 5),
    catchup=False,
    tags=["example", "gcs"],
) as dag:

    dummy = DummyOperator(
        task_id="start",
    )

    # today folder name
    fish_mkdir_today = SSHOperator(
        task_id="fish-mkdir-today",
        command="mkdir -p {{params.fish_des}}/{{ ds_nodash }}",
        ssh_hook=sshHook,
        params={'fish_des': DESTINATION_PATH_FISH,
                'date': date,
                }
    )

    sashimi_mkdir_today = SSHOperator(
        task_id="sashimi-mkdir-today",
        command="mkdir -p {{params.sashimi_des}}/{{ ds_nodash }}",
        ssh_hook=sshHook,
        params={'sashimi_des': DESTINATION_PATH_SASHIMI,
                'date': date,
                }
    )

    # [START howto_operator_gcs_to_sftp_move_specific_files]
    move_dir_fish = GCSToSFTPOperator(
        task_id="fish-move-gsc-to-sftp",
        sftp_conn_id=SFTP_CONN_ID,
        source_bucket=BUCKET_NAME,
        source_object=GCS_FISH_DIR,
        destination_path=os.path.join(DESTINATION_PATH_FISH, date),
        keep_directory_structure=False,
    )

    move_dir_sashimi = GCSToSFTPOperator(
        task_id="sashimi-move-gsc-to-sftp",
        sftp_conn_id=SFTP_CONN_ID,
        source_bucket=BUCKET_NAME,
        source_object=GCS_SASHIMI_DIR,
        destination_path=os.path.join(DESTINATION_PATH_SASHIMI, date),
        keep_directory_structure=False,
    )

    fish_image_delete = GCSDeleteObjectsOperator(
        task_id="fish-image-delete",
        bucket_name=BUCKET_NAME,
        prefix='fish/'
    )

    sashimi_image_delete = GCSDeleteObjectsOperator(
        task_id="sashimi-image-delete",
        bucket_name=BUCKET_NAME,
        prefix='sashimi/'
    )

    make_dataset = SSHOperator(
        task_id="make-dataset",
        command="python /opt/ml/final-project-level3-cv-13/Data/utils/pseudo_dataset.py --date=\"{{ ds_nodash }}\"",
        ssh_hook=sshHook,
        params={'sashimi_des': DESTINATION_PATH_SASHIMI,
                'date': date,
                }
    )

    kfold_dataset = SSHOperator(
        task_id="kfold-dataset",
        command="python /opt/ml/final-project-level3-cv-13/Data/utils/dataset_split.py",
        ssh_hook=sshHook,
    )

    (
        dummy >> fish_mkdir_today >> move_dir_fish >> fish_image_delete >> make_dataset,
        dummy >> sashimi_mkdir_today >> move_dir_sashimi >> sashimi_image_delete >> make_dataset,
        make_dataset >> kfold_dataset
    )