diff --git a/.github/integration/tests/tests.sh b/.github/integration/tests/tests.sh index 449101cd..c57b35ac 100755 --- a/.github/integration/tests/tests.sh +++ b/.github/integration/tests/tests.sh @@ -149,31 +149,89 @@ else exit 1 fi -# Dataset size using a local urls_list.txt -echo "http://localhost:9000/download/A352764B-2KB4-4738-B6B5-BA55D25FB469/data_file.c4gh" > urls_list.txt +# Download file by using the sda download service +./sda-cli download -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh + +# Check if file exists in the path +if [ ! -f "test-download/main/subfolder/dummy_data" ]; then + echo "Downloaded file not found" + exit 1 +fi + +# Check the first line of that file +first_line=$(head -n 1 test-download/main/subfolder/dummy_data) +if [[ $first_line != *"THIS FILE IS JUST DUMMY DATA"* ]]; then + echo "First line does not contain the expected string" + exit 1 +fi + +rm -r test-download + +# Check listing files in a dataset +output=$(./sda-cli list -config testing/s3cmd-download.conf -dataset https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080) +expected="dummy_data.c4gh 1048605 dummy_data2.c4gh 1048605 dummy_data3.c4gh 1048605" +if [[ "${output//[$' \t\n\r']/}" == "${expected//[$' \t\n\r']/}" ]]; then + echo "Successfully listed files in dataset" +else + echo "Failed to list files in dataset" + exit 1 +fi -s3cmd -c testing/directS3 put data_files_enc/data_file.c4gh s3://download/A352764B-2KB4-4738-B6B5-BA55D25FB469/data_file.c4gh -check_uploaded_file download/A352764B-2KB4-4738-B6B5-BA55D25FB469/data_file.c4gh data_file.c4gh +# Check listing datasets +output=$(./sda-cli list -config testing/s3cmd-download.conf --datasets -url http://localhost:8080) +expected="https://doi.example/ty009.sfrrss/600.45asasga" +if [[ $output == *"$expected"* ]]; then + echo "Successfully listed datasets" +else + echo "Failed to 
list datasets" + exit 1 +fi -s3cmd -c testing/directS3 put urls_list.txt s3://download/A352764B-2KB4-4738-B6B5-BA55D25FB469/urls_list.txt +# Download whole dataset by using the sda-download feature +./sda-cli download -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-dataset --dataset + +filepaths="download-dataset/main/subfolder/dummy_data download-dataset/main/subfolder2/dummy_data2 download-dataset/main/subfolder2/random/dummy_data3" + +# Check if all the files of the dataset have been downloaded +for filepath in $filepaths; do + if [ ! -f "$filepath" ]; then + echo "File $filepath does not exist" + exit 1 + fi +done -# Download file with local urls_list.txt -./sda-cli download -outdir downloads urls_list.txt +rm -r download-dataset -if [ -f downloads/data_file.c4gh ]; then - echo "Downloaded data file" +# Download encrypted file by using the sda download service +# Create a user key pair +if ( yes "" | ./sda-cli createKey user_key ) ; then + echo "Created a user key pair for downloading encrypted files" else - echo "Failed to download data file" + echo "Failed to create a user key pair for downloading encrypted files" exit 1 fi +./sda-cli download -pubkey user_key.pub.pem -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh -# Decrypt file -C4GH_PASSWORD="" ./sda-cli decrypt -key sda_key.sec.pem downloads/data_file.c4gh +# check if file exists in the path +if [ ! 
-f "test-download/main/subfolder/dummy_data.c4gh" ]; then + echo "Downloaded file not found" + exit 1 +fi -if [ -f downloads/data_file ]; then - echo "Decrypted data file" +# decrypt the downloaded file +C4GH_PASSWORD="" ./sda-cli decrypt -key user_key.sec.pem test-download/main/subfolder/dummy_data.c4gh + +if [ -f test-download/main/subfolder/dummy_data ]; then + echo "Decrypting downloaded file succeeded" else - echo "Failed to decrypt data file" + echo "Failed to decrypt downloaded file" + exit 1 +fi + +# check the first line of that file +first_line=$(head -n 1 test-download/main/subfolder/dummy_data) +if [[ $first_line != *"THIS FILE IS JUST DUMMY DATA"* ]]; then + echo "First line does not contain the expected string" exit 1 fi @@ -194,7 +252,7 @@ done cat sda_key1.pub.pem sda_key2.pub.pem > sda_keys # Create test files -cp data_file data_file_keys +cp test-download/main/subfolder/dummy_data data_file_keys # Encrypt with multiple key flag calls ./sda-cli encrypt -key sda_key.pub.pem -key sda_key2.pub.pem data_file_keys @@ -252,166 +310,12 @@ done # Remove files used for encrypt and upload rm -r data_files_enc rm -r data_files_unenc -rm -r downloads -rm sda_key* checksum_* urls_list.txt data_file* - -# Dataset size using a url urls_list.txt -output=$(./sda-cli datasetsize http://localhost:9000/download/A352764B-2KB4-4738-B6B5-BA55D25FB469/urls_list.txt | grep -q "Total dataset size: 1.00MB") - -if $output; then - echo "Returned dataset size" -else - echo "Failed to return dataset size" - exit 1 -fi - -# Dataset size using a folder url -output=$(./sda-cli datasetsize http://localhost:9000/download/A352764B-2KB4-4738-B6B5-BA55D25FB469/ | grep -q "Total dataset size: 1.00MB") - -if $output; then - echo "Returned dataset size" -else - echo "Failed to return dataset size" - exit 1 -fi - -# Check that Download handles http responses with error status code - -# Try downloading nonexistent file -printf "%s" "Attempting to download a nonexistent file from S3..." 
-errorMessage="reason: request failed with \`404 Not Found\`, details: {Code:NoSuchKey" -if ./sda-cli download -outdir downloads http://localhost:9000/download/imaginary/path/ 2>&1 | grep -q "$errorMessage"; then - echo "bad download request handled properly" -else - echo "Failed to handle bad download request" - exit 1 -fi - -# Try downloading from private bucket -printf "%s" "Attempting to download from S3 bucket with ACL=private..." -errorMessage="reason: request failed with \`403 Forbidden\`, details: {Code:AllAccessDisabled" -if ./sda-cli download -outdir downloads http://localhost:9000/minio/test/"$user"/data_file1.c4gh 2>&1 | grep -q "$errorMessage"; then - echo "bad download request handled properly" -else - echo "Failed to handle bad download request" - exit 1 -fi - -# Download files using a folder url -./sda-cli download -outdir downloads http://localhost:9000/download/A352764B-2KB4-4738-B6B5-BA55D25FB469/ - -if [ -f downloads/data_file.c4gh ]; then - echo "Downloaded data file" -else - echo "Failed to download data file" - exit 1 -fi - -rm -r downloads - -# Download files using a url to urls_list.txt -./sda-cli download -outdir downloads http://localhost:9000/download/A352764B-2KB4-4738-B6B5-BA55D25FB469/urls_list.txt - -if [ -f downloads/data_file.c4gh ]; then - echo "Downloaded data file" -else - echo "Failed to download data file" - exit 1 -fi - -rm -r downloads - -# Download file by using the sda download service -./sda-cli sda-download -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh - -# Check if file exists in the path -if [ ! 
-f "test-download/main/subfolder/dummy_data" ]; then - echo "Downloaded file not found" - exit 1 -fi - -# Check the first line of that file -first_line=$(head -n 1 test-download/main/subfolder/dummy_data) -if [[ $first_line != *"THIS FILE IS JUST DUMMY DATA"* ]]; then - echo "First line does not contain the expected string" - exit 1 -fi - -rm -r test-download - -# Check listing files in a dataset -output=$(./sda-cli list -config testing/s3cmd-download.conf -dataset https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080) -expected="dummy_data.c4gh 1048605 dummy_data2.c4gh 1048605 dummy_data3.c4gh 1048605" -if [[ "${output//[$' \t\n\r']/}" == "${expected//[$' \t\n\r']/}" ]]; then - echo "Successfully listed files in dataset" -else - echo "Failed to list files in dataset" - exit 1 -fi - -# Check listing datasets -output=$(./sda-cli list -config testing/s3cmd-download.conf --datasets -url http://localhost:8080) -expected="https://doi.example/ty009.sfrrss/600.45asasga" -if [[ $output == *"$expected"* ]]; then - echo "Successfully listed datasets" -else - echo "Failed to list datasets" - exit 1 -fi - -# Download whole dataset by using the sda-download feature -./sda-cli sda-download -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-dataset --dataset - -filepaths="download-dataset/main/subfolder/dummy_data download-dataset/main/subfolder2/dummy_data2 download-dataset/main/subfolder2/random/dummy_data3" - -# Check if all the files of the dataset have been downloaded -for filepath in $filepaths; do - if [ ! 
-f "$filepath" ]; then - echo "File $filepath does not exist" - exit 1 - fi -done - -rm -r download-dataset - -# Download encrypted file by using the sda download service -# Create a user key pair -if ( yes "" | ./sda-cli createKey user_key ) ; then - echo "Created a user key pair for downloading encrypted files" -else - echo "Failed to create a user key pair for downloading encrypted files" - exit 1 -fi -./sda-cli sda-download -pubkey user_key.pub.pem -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir test-download main/subfolder/dummy_data.c4gh - -# check if file exists in the path -if [ ! -f "test-download/main/subfolder/dummy_data.c4gh" ]; then - echo "Downloaded file not found" - exit 1 -fi - -# decrypt the downloaded file -C4GH_PASSWORD="" ./sda-cli decrypt -key user_key.sec.pem test-download/main/subfolder/dummy_data.c4gh - -if [ -f test-download/main/subfolder/dummy_data ]; then - echo "Decrypting downloaded file succeeded" -else - echo "Failed to decrypt downloaded file" - exit 1 -fi - -# check the first line of that file -first_line=$(head -n 1 test-download/main/subfolder/dummy_data) -if [[ $first_line != *"THIS FILE IS JUST DUMMY DATA"* ]]; then - echo "First line does not contain the expected string" - exit 1 -fi - +rm sda_key* data_file* rm -r test-download # Download recursively a folder echo "Downloading content of folder" -./sda-cli sda-download -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-folder --recursive main/subfolder2 +./sda-cli download -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-folder --recursive main/subfolder2 folderpaths="download-folder/main/subfolder2/dummy_data2 download-folder/main/subfolder2/random/dummy_data3" @@ -423,10 +327,10 @@ for folderpath in $folderpaths; do fi 
done -rm -r download-folder +rm -r download-folder # Download file by providing the file id -./sda-cli sda-download -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-fileid urn:neic:001-001 +./sda-cli download -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-fileid urn:neic:001-001 # Check if file exists in the path if [ ! -f "download-fileid/main/subfolder/dummy_data" ]; then @@ -445,7 +349,7 @@ rm -r download-fileid # Download the file paths content of a text file echo "Downloading content of a text file" -./sda-cli sda-download -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-from-file --from-file testing/file-list.txt +./sda-cli download -config testing/s3cmd-download.conf -dataset-id https://doi.example/ty009.sfrrss/600.45asasga -url http://localhost:8080 -outdir download-from-file --from-file testing/file-list.txt # Check if the content of the text file has been downloaded content_paths="download-from-file/main/subfolder/dummy_data download-from-file/main/subfolder2/dummy_data2" diff --git a/README.md b/README.md index e98bc4ac..6a562cce 100644 --- a/README.md +++ b/README.md @@ -173,22 +173,6 @@ will first encrypt all files in `` and t - If the flag `--force-overwrite` is used, the tool will overwrite any already existing file. - The cli will exit if the input has any un-encrypred files. To override that, use the flag `--force-unencrypted`. -## Get dataset size - -Before downloading a dataset or a specific file, the `sda-cli` tool allows for requesting the size of each file, as well as the whole dataset. In order to use this functionality, the tool expects as an argument a file containing the location of the files in the dataset. The argument can be one of the following: - -1. 
a URL to the file containing the locations of the dataset files -2. a URL to a folder containing the `urls_list.txt` file with the locations of the dataset files -3. the path to a local file containing the locations of the dataset files. - -Given this argument, the dataset size can be retrieved using the following command: - -```bash -./sda-cli datasetsize -``` - -where `urls_file` as described above. - ## List files The uploaded files can be listed using the `list` parameter. This feature returns all the files in the user's bucket recursively and can be executed using: @@ -208,51 +192,8 @@ If no config is given by the user, the tool will look for a previous login from ## Download -The SDA/BP archive enables for downloading files and datasets in a secure manner. That can be achieved using the `sda-cli` tool and and it can be done in two ways: - -- by downloading from a S3 bucket (`./sda-cli download`) -- by using the download API (`./sda-cli sda-download`) - -### Download from S3 bucket - -This process consists of the following two steps: create keys and downloading the file. These steps are explained in the following sections. - -#### Create keys - -In order to make sure that the files are downloaded from the archive in a secure manner, the user is supposed to create the key pair that the files will be encrypted with. The key pair can be created using the following command: - -```bash -./sda-cli createKey -``` - -where `` is the base name of the key files. This command will create two keys named `keypair_name.pub.pem` and `keypair_name.sec.pem`. The public key (`pub`) will be used for the encryption of the files, while the private one (`sec`) will be used in the decryption step below. - -**NOTE:** Make sure to keep these keys safe. Losing the keys could lead to sensitive data leaks. - -#### Download file - -The `sda-cli` tool allows for downloading file(s)/datasets. 
The URLs of the respective dataset files that are available for downloading are stored in a file named `urls_list.txt`. `sda-cli` allows to download files only by using such a file or the URL where it is stored. There are three different ways to pass the location of the file to the tool, similar to the [dataset size section](#get-dataset-size): - -1. a direct URL to `urls_list.txt` or a file with a different name but containing the locations of the dataset files -2. a URL to a folder containing the `urls_list.txt` file -3. the path to a local file containing the locations of the dataset files. - -Given this argument, the whole dataset can be retrieved using the following command: - -```bash -./sda-cli download -``` - -where `urls_file` as described above. -The tool also allows for selecting a folder where the files will be downloaded, using the `outdir` argument like: - -```bash -./sda-cli download -outdir -``` - -**Note**: If needed, the user can download a selection of files from an available dataset by providing a customized `urls_list.txt` file. - -### Download using the download API +Files and datasets can be downloaded using the `download` parameter. +This utilizes the Download API which enables secure downloads from the SDA/BP archive. The download API allows for downloading files from the archive and it requires the user to have access to the dataset, therefore a [configuration file](#download-the-configuration-file) needs to be downloaded before starting the downloading of the files. For downloading files the user also needs to know the download service URL and the dataset ID. 
The user has several options for downloading: @@ -269,14 +210,14 @@ For downloading files the user also needs to know the download service URL and t For downloading one specific file the user needs to provide the path or the id (the id should **NOT** have "/") of this file by running the command below: ```bash -./sda-cli sda-download -config -dataset-id -url [ or ] +./sda-cli download -config -dataset-id -url [ or ] ``` where `` the file downloaded in the [previous step](#download-the-configuration-file), `` the ID of the dataset and `` the path of the file (or `` the id of the file) in the dataset. The tool also allows for downloading multiple files at once, by listing their filepaths (or file ids) separated with space and it also allows for selecting a folder where the files will be downloaded, using the `outdir` argument: ```bash -./sda-cli sda-download -config -dataset-id -url -outdir ... (or ...) +./sda-cli download -config -dataset-id -url -outdir ... (or ...) ``` #### Download files recursively @@ -284,7 +225,7 @@ The tool also allows for downloading multiple files at once, by listing their fi For downloading the content of a folder (including subfolders) the user need to add the `--recursive` flag followed by the path(s) of the folder(s): ```bash -./sda-cli sda-download -config -dataset-id -url -outdir --recursive path/to/folder1 path/to/folder2 ... +./sda-cli download -config -dataset-id -url -outdir --recursive path/to/folder1 path/to/folder2 ... 
``` #### Download from file @@ -293,7 +234,7 @@ For downloading multiple files the user can provide a text file with the paths o In this case user needs to use the `--from-file` flag and at the end user needs to provide the path of the text file with the paths of the files to download: ```bash -./sda-cli sda-download -config -dataset-id -url -outdir --from-file +./sda-cli download -config -dataset-id -url -outdir --from-file ``` #### Download all the files of the dataset @@ -301,7 +242,7 @@ In this case user needs to use the `--from-file` flag and at the end user needs For downloading the whole dataset the user needs add the `--dataset` flag and NOT providing any filepaths: ```bash -./sda-cli sda-download -config -dataset-id -url -outdir --dataset +./sda-cli download -config -dataset-id -url -outdir --dataset ``` where the dataset will be downloaded in the `` directory be keeping the original folder structure of the dataset. @@ -311,7 +252,7 @@ where the dataset will be downloaded in the `` directory be keeping the When a [public key](#create-keys) is provided, you can download files that are encrypted on the server-side with that public key. The command is similar to downloading the unencrypted files except that a public key is provided through the `-pubkey` flag. For example: ```bash -./sda-cli sda-download -pubkey -config -dataset-id -url -outdir ... +./sda-cli download -pubkey -config -dataset-id -url -outdir ... ``` After a successful download, the encrypted files can be [decrypted](#decrypt-file) using the private key corresponding to the provided public key. 
diff --git a/datasetsize/datasetsize.go b/datasetsize/datasetsize.go deleted file mode 100644 index 861335ee..00000000 --- a/datasetsize/datasetsize.go +++ /dev/null @@ -1,108 +0,0 @@ -package datasetsize - -import ( - "flag" - "fmt" - "net/http" - "os" - "strconv" - "strings" - - "github.com/NBISweden/sda-cli/download" - "github.com/inhies/go-bytesize" - log "github.com/sirupsen/logrus" -) - -// Help text and command line flags. - -// Usage text that will be displayed as command line help text when using the -// `help list` command -var Usage = ` -USAGE: %s datasetsize [url(s) | file] - -datasetsize: - List files that can be downloaded from the Sensitive Data - Archive (SDA). If a URL is provided (ending with "/" or the - urls_list.txt file), then the tool will attempt to first download - the urls_list.txt file, and then return a list of the files with - their respective sizes. -` - -// ArgHelp is the suffix text that will be displayed after the argument list in -// the module help -var ArgHelp = ` - [url] - The first flagless argument will be used as file location.` - -// Args is a flagset that needs to be exported so that it can be written to the -// main program help -var Args = flag.NewFlagSet("datasetsize", flag.ExitOnError) - -// Function to return the size of a file -func getFileSize(file string) (downloadSize int64, err error) { - resp, err := http.Head(file) - if err != nil { - return 0, fmt.Errorf("failed to head file, reason: %v", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return 0, fmt.Errorf("failed to get file, code response not 200") - } - - size, _ := strconv.Atoi(resp.Header.Get("Content-Length")) - downloadSize = int64(size) - - return downloadSize, nil -} - -// DatasetSize function returns the list of the files available for downloading and their -// respective size. The argument can be a local file or a url to an S3 folder -func DatasetSize(args []string) error { - // Parse flags. 
There are no flags at the moment, but in case some are added - // we check for them. - err := Args.Parse(args[1:]) - if err != nil { - return fmt.Errorf("failed parsing arguments, reason: %v", err) - } - - // Args() returns the non-flag arguments, which we assume are filenames. - urls := Args.Args() - if len(urls) == 0 { - return fmt.Errorf("failed to find location of files, no argument passed") - } - - var currentPath, urlsFilePath string - currentPath, err = os.Getwd() - if err != nil { - return fmt.Errorf("failed to get current path, reason: %v", err) - } - - urlsFilePath, err = download.GetURLsListFile(currentPath, urls[0]) - if err != nil { - return fmt.Errorf("failed to get urls list file, reason: %v", err) - } - - // Open urls_list.txt file and loop through file urls - urlsList, err := download.GetURLsFile(urlsFilePath) - if err != nil { - return err - } - - var datasetSize float64 - // Get the size for each of the files in the list - for _, file := range urlsList { - - downloadSize, err := getFileSize(file) - if err != nil { - return err - } - datasetSize += float64(downloadSize) - fmt.Printf("%s \t %s \n", bytesize.New(float64(downloadSize)), file[strings.LastIndex(file, "/")+1:]) - } - fmt.Printf("Total dataset size: %s \n", bytesize.New(datasetSize)) - - log.Info("finished listing available files") - - return nil -} diff --git a/datasetsize/datasetsize_test.go b/datasetsize/datasetsize_test.go deleted file mode 100644 index d6e1e824..00000000 --- a/datasetsize/datasetsize_test.go +++ /dev/null @@ -1,71 +0,0 @@ -package datasetsize - -import ( - "net/http" - "net/http/httptest" - "os" - "runtime" - "strings" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/suite" -) - -type TestSuite struct { - suite.Suite -} - -func TestConfigTestSuite(t *testing.T) { - suite.Run(t, new(TestSuite)) -} - -func (suite *TestSuite) SetupTest() { - -} - -func (suite *TestSuite) TestNoArgument() { - - os.Args = []string{"filesize"} - - err := 
DatasetSize(os.Args) - assert.EqualError(suite.T(), err, "failed to find location of files, no argument passed") -} - -func (suite *TestSuite) TestFileDoesNotExist() { - - os.Args = []string{"filesize", "somefile"} - - err := DatasetSize(os.Args) - msg := "open somefile: no such file or directory" - if runtime.GOOS == "windows" { - msg = "open somefile: The system cannot find the file specified." - } - assert.EqualError(suite.T(), err, msg) -} - -// Test the size of the file returned from the function -func (suite *TestSuite) TestGetFileSize() { - fileContent := "some text!" - - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - _, err := w.Write([]byte(fileContent)) - assert.NoError(suite.T(), err) - })) - defer ts.Close() - - fileLocation := ts.URL + "/A352744B-2CB4-4738-B6B5-BA55D25FB469/some/file.c4gh" - size, err := getFileSize(fileLocation) - - assert.Equal(suite.T(), int64(10), size) - assert.NoError(suite.T(), err) -} - -func (suite *TestSuite) TestGetFileSizeFail() { - - fileLocation := "http://url/to/file/A352744B-2CB4-4738-B6B5-BA55D25FB469/some/file.c4gh" - size, err := getFileSize(fileLocation) - - assert.True(suite.T(), strings.HasPrefix(err.Error(), "failed to head file, reason:")) - assert.Equal(suite.T(), int64(0), size) -} diff --git a/download/download.go b/download/download.go index ea45ef1e..5c629773 100644 --- a/download/download.go +++ b/download/download.go @@ -2,17 +2,23 @@ package download import ( "bufio" + "encoding/base64" + "encoding/json" + "errors" "flag" "fmt" "io" "net/http" + "net/mail" + "net/url" "os" "path/filepath" - "regexp" + "slices" "strings" "github.com/NBISweden/sda-cli/helpers" - log "github.com/sirupsen/logrus" + "github.com/vbauerster/mpb/v8" + "github.com/vbauerster/mpb/v8/decor" ) // Help text and command line flags. 
@@ -20,191 +26,485 @@ import ( // Usage text that will be displayed as command line help text when using the // `help download` command var Usage = ` -USAGE: %s download (-outdir ) [url | file] +USAGE: %s download -config -dataset-id -url (--pubkey ) (-outdir ) ([filepath(s) or fileid(s)] or --dataset or --recursive ) or --from-file download: - Downloads files from the Sensitive Data Archive (SDA). A list with - URLs for files to download must be provided either as a URL directly - to a remote url_list.txt file or to its containing directory - (ending with "/"). Alternatively, the local path to such a file may - be given, instead. The files will be downloaded in the current - directory, if outdir is not defined and their folder structure is - preserved. -` + Downloads files from the Sensitive Data Archive (SDA) by using APIs from the given url. The user + must have been granted access to the datasets (visas) that are to be downloaded. + The files will be downloaded in the current directory, if outdir is not defined. + When the -pubkey flag is used, the downloaded files will be server-side encrypted with the given public key. + If the --dataset flag is used, all files in the dataset will be downloaded. + If the --recursive flag is used, all files in the directory will be downloaded. + If the --from-file flag is used, all the files that are in the file will be downloaded. + ` // ArgHelp is the suffix text that will be displayed after the argument list in // the module help var ArgHelp = ` - [urls] - All flagless arguments will be used as download URLs.` + [datasetID] + The ID of the dataset that the file is part of. + [uri] + All flagless arguments will be used as download uri. + [filepath(s)] + The filepath of the file to download. + [fileid(s)] + The file ID of the file to download. + [dirpath] + The directory path to download all files recursively. 
+ [list-filepath] + The path to the file that contains the list of files to download.` // Args is a flagset that needs to be exported so that it can be written to the // main program help var Args = flag.NewFlagSet("download", flag.ExitOnError) -var outDir = Args.String("outdir", "", - "Directory for downloaded files.") - -// Gets the file name for a URL, using regex -func createFilePathFromURL(file string, baseDir string) (fileName string, err error) { - // Create the file path according to the way files are stored in S3 - // The folder structure comes after the UID described in the regex - re := regexp.MustCompile(`(?i)[0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12}/(.*)`) - match := re.FindStringSubmatch(file) - if match == nil || len(match) < 1 { - return fileName, fmt.Errorf("failed to parse url for downloading file") - } - if baseDir != "" && !strings.HasSuffix(baseDir, "/") { - baseDir += "/" - } - fileName = filepath.Join(baseDir, match[1]) - - var filePath string - if strings.Contains(fileName, string(os.PathSeparator)) { - filePath = filepath.Dir(fileName) - err = os.MkdirAll(filePath, os.ModePerm) - if err != nil { - return fileName, err + +var configPath = Args.String("config", "", "S3 config file to use for downloading.") + +var datasetID = Args.String("dataset-id", "", "Dataset ID for the file to download.") + +var URL = Args.String("url", "", "The url of the download server.") + +var outDir = Args.String("outdir", "", "Directory for downloaded files.") + +var datasetdownload = Args.Bool("dataset", false, "Download all the files of the dataset.") + +var pubKeyPath = Args.String("pubkey", "", + "Public key file to use for encryption of files to download.") + +var recursiveDownload = Args.Bool("recursive", false, "Download content of the folder.") + +var fromFile = Args.Bool("from-file", false, "Download files from file list.") + +// necessary for mocking in testing +var getResponseBody = getBody + +// File struct represents the file 
metadata +type File struct { + FileID string `json:"fileId"` + DatasetID string `json:"datasetId"` + DisplayFileName string `json:"displayFileName"` + FilePath string `json:"filePath"` + FileName string `json:"fileName"` + FileSize int `json:"fileSize"` + DecryptedFileSize int `json:"decryptedFileSize"` + DecryptedFileChecksum string `json:"decryptedFileChecksum"` + DecryptedFileChecksumType string `json:"decryptedFileChecksumType"` + FileStatus string `json:"fileStatus"` + CreatedAt string `json:"createdAt"` + LastModified string `json:"lastModified"` +} + +// Download function downloads files from the SDA by using the +// download's service APIs +func Download(args []string) error { + // Call ParseArgs to take care of all the flag parsing + err := helpers.ParseArgs(args, Args) + if err != nil { + return fmt.Errorf("failed parsing arguments, reason: %v", err) + } + + if *datasetID == "" || *URL == "" || *configPath == "" { + return fmt.Errorf("missing required arguments, dataset, config and url are required") + } + + // Check if both --recursive and --dataset flags are set + if *recursiveDownload && *datasetdownload { + return fmt.Errorf("both --recursive and --dataset flags are set, choose one of them") + } + + // Check that file(s) are not missing if the --dataset flag is not set + if len(Args.Args()) == 0 && !*datasetdownload { + if !*recursiveDownload { + return fmt.Errorf("no files provided for download") } + + return fmt.Errorf("no folders provided for recursive download") } - return fileName, nil -} + // Check if --dataset flag is set and files are provided + if *datasetdownload && len(Args.Args()) > 0 { + return fmt.Errorf( + "files provided with --dataset flag, add either the flag or the file(s), not both", + ) + } -// Downloads a file from the url to the filePath location -func downloadFile(url string, filePath string) error { + // Check if --from-file flag is set and only one file is provided + if *fromFile && len(Args.Args()) != 1 { + return 
fmt.Errorf( + "one file should be provided with --from-file flag", + ) + } - // Get the file from the provided url - resp, err := http.Get(url) + // Get the configuration file or the .sda-cli-session + config, err := helpers.GetAuth(*configPath) if err != nil { - return fmt.Errorf("failed to download file, reason: %v", err) + return err } - defer resp.Body.Close() - // Check reponse status and report S3 error response - if resp.StatusCode >= 400 { - errorDetails, err := helpers.ParseS3ErrorResponse(resp.Body) + // Check if the token has expired + err = helpers.CheckTokenExpiration(config.AccessToken) + if err != nil { + return err + } + + switch { + // Case where the user is setting the --dataset flag + // then download all the files in the dataset. + // Case where the user is setting the --recursive flag + // then download the content of the path + // Case where the user is setting the --from-file flag + // then download the files from the file list + // Default case, download the provided files. 
+ case *datasetdownload: + err = datasetCase(config.AccessToken) + if err != nil { + return err + } + case *recursiveDownload: + err = recursiveCase(config.AccessToken) + if err != nil { + return err + } + case *fromFile: + err = fileCase(config.AccessToken, true) + if err != nil { + return err + } + default: + err = fileCase(config.AccessToken, false) if err != nil { - log.Error(err.Error()) + return err } + } - return fmt.Errorf("request failed with `%s`, details: %v", resp.Status, errorDetails) + return nil +} + +func datasetCase(token string) error { + fmt.Println("Downloading all files in the dataset") + files, err := GetFilesInfo(*URL, *datasetID, "", token) + if err != nil { + return err } + // Loop through the files and download them + for _, file := range files { + // Download URL for the file + fileURL := *URL + "/files/" + file.FileID + err = downloadFile(fileURL, token, "", file.FilePath) + if err != nil { + return err + } + } + + return nil +} - // Create the file in the current location - out, err := os.Create(filePath) +func recursiveCase(token string) error { + fmt.Println("Downloading content of the path(s)") + // get all the files of the dataset + files, err := GetFilesInfo(*URL, *datasetID, "", token) if err != nil { return err } - defer out.Close() + // check all the provided paths and add a slash + // to each one of them if does not exist and + // append them in a slice + var dirPaths []string + for _, path := range Args.Args() { + if !strings.HasSuffix(path, "/") { + path += "/" + } + dirPaths = append(dirPaths, path) + } + var missingPaths []string + // Loop over all the files of the dataset and + // check if the provided path is part of their filepath. 
+ // If it is then download the file + for _, dirPath := range dirPaths { + pathExists := false + for _, file := range files { + if strings.Contains(file.FilePath, dirPath) { + pathExists = true + fileURL := *URL + "/files/" + file.FileID + err = downloadFile(fileURL, token, "", file.FilePath) + if err != nil { + return err + } + } + } + // If dirPath does not exist add it to the list + if !pathExists { + missingPaths = append(missingPaths, dirPath) + } + } + // If all the given paths do not exist then return an error + if len(missingPaths) == len(dirPaths) { + return errors.New("given path(s) do not exist") + } + // If some of the given paths do not exist then just return a message + if len(missingPaths) > 0 { + for _, missingPath := range missingPaths { + fmt.Println("Non existing path: ", missingPath) + } + } - // Write the body to file - _, err = io.Copy(out, resp.Body) - defer out.Close() + return nil +} - return err +func fileCase(token string, fileList bool) error { + var files []string + if fileList { + // get the files from the file list + fmt.Println("Downloading files from file list") + fileList, err := GetURLsFile(Args.Args()[0]) + if err != nil { + return err + } + files = append(files, fileList...) + } else { + // get the files from the arguments + fmt.Println("Downloading files") + files = append(files, Args.Args()...) 
+ } + + *pubKeyPath = strings.TrimSpace(*pubKeyPath) + var pubKeyBase64 string + if *pubKeyPath != "" { + // Read the public key + pubKey, err := os.ReadFile(*pubKeyPath) + if err != nil { + return fmt.Errorf("failed to read public key, reason: %v", err) + } + pubKeyBase64 = base64.StdEncoding.EncodeToString(pubKey) + } + + // Loop through the files and download them + for _, filePath := range files { + fileIDURL, apiFilePath, err := getFileIDURL(*URL, token, pubKeyBase64, *datasetID, filePath) + if err != nil { + return err + } + + err = downloadFile(fileIDURL, token, pubKeyBase64, apiFilePath) + if err != nil { + return err + } + } + return nil } -// GetURLsFile reads the urls_list.txt file and returns the urls of the files in a list -func GetURLsFile(urlsFilePath string) (urlsList []string, err error) { +// downloadFile downloads the file by using the download URL +func downloadFile(uri, token, pubKeyBase64, filePath string) error { + // Check if the file path contains a userID and if it does, + // do not keep it in the file path + filePathSplit := strings.Split(filePath, "/") + if strings.Contains(filePathSplit[0], "_") { + _, err := mail.ParseAddress(strings.ReplaceAll(filePathSplit[0], "_", "@")) + if err == nil { + filePath = strings.Join(filePathSplit[1:], "/") + } + } - urlsFile, err := os.Open(filepath.Clean(urlsFilePath)) + outFilename := filePath + if *outDir != "" { + outFilename = *outDir + "/" + filePath + } + + filePath = strings.TrimSuffix(outFilename, ".c4gh") + + // Get the file body + body, err := getResponseBody(uri, token, pubKeyBase64) if err != nil { - return nil, err + return fmt.Errorf("failed to get file for download, reason: %v", err) } - defer urlsFile.Close() - scanner := bufio.NewScanner(urlsFile) - for scanner.Scan() { - urlsList = append(urlsList, scanner.Text()) + // Create the directory if it does not exist + fileDir := filepath.Dir(filePath) + err = os.MkdirAll(fileDir, os.ModePerm) + if err != nil { + return fmt.Errorf("failed 
to create directory, reason: %v", err) } - if len(urlsList) == 0 { - return urlsList, fmt.Errorf("failed to get list of files, empty file") + + if pubKeyBase64 != "" { + filePath += ".c4gh" + } + outfile, err := os.Create(filePath) + if err != nil { + return fmt.Errorf("failed to create file, reason: %v", err) } + defer outfile.Close() - return urlsList, scanner.Err() + // Create a new progress container + p := mpb.New() + + // Create a new progress bar with the length of the body + bar := p.AddBar(int64(len(body)), + mpb.PrependDecorators( + decor.CountersKibiByte("% .2f / % .2f"), + ), + ) + + // Create a proxy reader + reader := strings.NewReader(string(body)) + proxyReader := bar.ProxyReader(reader) + + fmt.Printf("Downloading file to %s\n", filePath) + // Copy from the proxy reader (which updates the progress bar) to the file + _, err = io.Copy(outfile, proxyReader) + if err != nil { + return fmt.Errorf("failed to write file, reason: %v", err) + } + + // Wait for the progress bar to finish + p.Wait() + + return nil } -// GetURLsListFile is returning the path to the urls_list.txt by handling the URL -// or path provided by the user. In case of a URL, the file is downloaded in the -// current path -func GetURLsListFile(currentPath string, fileLocation string) (urlsFilePath string, err error) { +// getFileIDURL gets the dataset files, parses the JSON response to get the file ID +// and returns the download URL for the file and the filepath from the API response +func getFileIDURL(baseURL, token, pubKeyBase64, dataset, filename string) (string, string, error) { + // Get the files of the dataset + datasetFiles, err := GetFilesInfo(baseURL, dataset, pubKeyBase64, token) + if err != nil { + return "", "", err + } + + // Get the file ID for the filename + var idx int switch { - // Case where the user passes the url to the s3 folder where the data exists - // Download the urls_list.txt file first and then the data files - // e.g. 
https://some/url/to/folder/ - case strings.HasSuffix(fileLocation, "/") && regexp.MustCompile(`https?://`).MatchString(fileLocation): - urlsFilePath = currentPath + "/urls_list.txt" - err = downloadFile(fileLocation+"urls_list.txt", urlsFilePath) - if err != nil { - return "", err + case strings.Contains(filename, "/"): + // If filename does not have a crypt4gh suffix, add one + if !strings.HasSuffix(filename, ".c4gh") { + filename += ".c4gh" } - // Case where the user passes the url directly to urls_list.txt - // e.g. https://some/url/to/urls_list.txt - case regexp.MustCompile(`https?://`).MatchString(fileLocation): - urlsFilePath = currentPath + "/urls_list.txt" - err = downloadFile(fileLocation, urlsFilePath) - if err != nil { - return "", err - } - // Case where the user passes a file containg the urls to download - // e.g. /some/folder/to/file.txt + idx = slices.IndexFunc( + datasetFiles, + func(f File) bool { return strings.Contains(f.FilePath, filename) }, + ) default: - urlsFilePath = fileLocation + idx = slices.IndexFunc( + datasetFiles, + func(f File) bool { return strings.Contains(f.FileID, filename) }, + ) } - return urlsFilePath, nil -} + if idx == -1 { + return "", "", fmt.Errorf("File not found in dataset %s", filename) + } -// Download function downloads the files included in the urls_list.txt file. 
-// The argument can be a local file or a url to an S3 folder -func Download(args []string) error { + var url string + // If no public key is provided, retrieve the unencrypted file + if pubKeyBase64 == "" { + url = baseURL + "/files/" + datasetFiles[idx].FileID + } else { + url = baseURL + "/s3-encrypted/" + dataset + "/" + filename + } - // Call ParseArgs to take care of all the flag parsing - err := helpers.ParseArgs(args, Args) + return url, datasetFiles[idx].FilePath, nil +} + +func GetDatasets(baseURL, token string) ([]string, error) { + // Sanitize the base_url + u, err := url.ParseRequestURI(baseURL) + if err != nil || u.Scheme == "" { + return []string{}, fmt.Errorf("invalid base URL") + } + // Make the url for listing datasets + datasetsURL := baseURL + "/metadata/datasets" + // Get the response body from the datasets API + allDatasets, err := getResponseBody(datasetsURL, token, "") if err != nil { - return fmt.Errorf("failed parsing arguments, reason: %v", err) + return []string{}, fmt.Errorf("failed to get datasets, reason: %v", err) + } + // Parse the JSON response + var datasets []string + err = json.Unmarshal(allDatasets, &datasets) + if err != nil { + return []string{}, fmt.Errorf("failed to parse dataset list JSON, reason: %v", err) } - // Args() returns the non-flag arguments, which we assume are filenames. 
- urls := Args.Args() - if len(urls) == 0 { - return fmt.Errorf("failed to find location of files, no argument passed") + return datasets, nil +} + +// GetFilesInfo gets the files of the dataset by using the dataset ID +func GetFilesInfo(baseURL, dataset, pubKeyBase64, token string) ([]File, error) { + // Sanitize the base_url + u, err := url.ParseRequestURI(baseURL) + if err != nil || u.Scheme == "" { + return []File{}, fmt.Errorf("invalid base URL") } + // Make the url for listing files + filesURL := baseURL + "/metadata/datasets/" + dataset + "/files" + // Get the response body from the files API + allFiles, err := getResponseBody(filesURL, token, pubKeyBase64) + if err != nil { + return []File{}, fmt.Errorf("failed to get files, reason: %v", err) + } + // Parse the JSON response + var files []File + err = json.Unmarshal(allFiles, &files) + if err != nil { + return []File{}, fmt.Errorf("failed to parse file list JSON, reason: %v", err) + } + + return files, nil +} - var currentPath, urlsFilePath string - currentPath, err = os.Getwd() +// getBody gets the body of the response from the URL +func getBody(url, token, pubKeyBase64 string) ([]byte, error) { + req, err := http.NewRequest("GET", url, nil) if err != nil { - return fmt.Errorf("failed to get current path, reason: %v", err) + return nil, fmt.Errorf("failed to create request, reason: %v", err) + } + + // Add headers + req.Header.Add("Authorization", "Bearer "+token) + req.Header.Add("Content-Type", "application/json") + if pubKeyBase64 != "" { + req.Header.Add("Client-Public-Key", pubKeyBase64) } - urlsFilePath, err = GetURLsListFile(currentPath, urls[0]) + // Send the request + client := &http.Client{} + res, err := client.Do(req) if err != nil { - return fmt.Errorf("failed to urls list file, reason: %v", err) + return nil, fmt.Errorf("failed to get response, reason: %v", err) } - // Open urls_list.txt file and loop through file urls - urlsList, err := GetURLsFile(urlsFilePath) + // Check the status code + 
if res.StatusCode != http.StatusOK { + return nil, fmt.Errorf("server returned status %d", res.StatusCode) + } + + // Read the response body + resBody, err := io.ReadAll(res.Body) if err != nil { - return err + return nil, fmt.Errorf("failed to read response body, reason: %v", err) } - // Download the files and create the folder structure - for _, file := range urlsList { + defer res.Body.Close() - fileName, err := createFilePathFromURL(file, *outDir) - if err != nil { - return err - } + return resBody, nil +} - err = downloadFile(file, fileName) - if err != nil { - return err - } - fmt.Printf("downloaded file from url %s\n", fileName) +// GetURLsFile reads the urls_list.txt file and returns the urls of the files in a list +func GetURLsFile(urlsFilePath string) (urlsList []string, err error) { + + urlsFile, err := os.Open(filepath.Clean(urlsFilePath)) + if err != nil { + return nil, err } + defer urlsFile.Close() - fmt.Println("finished downloading files from url") + scanner := bufio.NewScanner(urlsFile) + for scanner.Scan() { + urlsList = append(urlsList, scanner.Text()) + } + if len(urlsList) == 0 { + return urlsList, fmt.Errorf("failed to get list of files, empty file") + } - return nil + return urlsList, scanner.Err() } diff --git a/download/download_test.go b/download/download_test.go index ff2e415c..348c1caf 100644 --- a/download/download_test.go +++ b/download/download_test.go @@ -1,249 +1,297 @@ package download import ( - "io" - "log" + "fmt" "net/http" "net/http/httptest" "os" "path/filepath" - "runtime" - "strings" "testing" + log "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" ) type TestSuite struct { suite.Suite + accessToken string } -func TestConfigTestSuite(t *testing.T) { - suite.Run(t, new(TestSuite)) -} - -func (suite *TestSuite) SetupTest() { - -} - -func (suite *TestSuite) TestNoArgument() { - - os.Args = []string{"download"} - - err := 
Download(os.Args) - assert.EqualError(suite.T(), err, "failed to find location of files, no argument passed") -} - -func (suite *TestSuite) TestdownloadFileWrongUrl() { - - url := "someUrl" - filePath := "." - err := downloadFile(url, filePath) - - assert.EqualError(suite.T(), err, "failed to download file, reason: Get \"someUrl\": unsupported protocol scheme \"\"") -} - -func (suite *TestSuite) TestWrongUrlsFile() { - - urlsListPath, err := os.CreateTemp(os.TempDir(), "urls_list-") - assert.NoError(suite.T(), err) - defer os.Remove(urlsListPath.Name()) - - _, err = GetURLsFile(urlsListPath.Name()) - assert.EqualError(suite.T(), err, "failed to get list of files, empty file") -} - -func (suite *TestSuite) TestCorrectUrlsFile() { - - urlsListFile := `someUrlToFile1 -someUrlToFile2 -someUrlToFile3 -` - - urlsListPath, err := os.CreateTemp(os.TempDir(), "urls_list-") - assert.NoError(suite.T(), err) - defer os.Remove(urlsListPath.Name()) +func createConfigFile(fileName, token string) os.File { + // Create conf file for sda-cli + confFile := fmt.Sprintf(` + access_token = %[1]s + host_base = inbox.dummy.org + encoding = UTF-8 + host_bucket = inbox.dummy.org + multipart_chunk_size_mb = 50 + secret_key = dummy + access_key = dummy + use_https = False + check_ssl_certificate = False + check_ssl_hostname = False + socket_timeout = 30 + human_readable_sizes = True + guess_mime_type = True + encrypt = False + `, token) + + // Create config file + configPath, err := os.CreateTemp(os.TempDir(), fileName) + if err != nil { + log.Panic(err) + } - err = os.WriteFile(urlsListPath.Name(), []byte(urlsListFile), 0600) + // Write config file + err = os.WriteFile(configPath.Name(), []byte(confFile), 0600) if err != nil { log.Printf("failed to write temp config file, %v", err) } - urlsList, err := GetURLsFile(urlsListPath.Name()) - assert.NoError(suite.T(), err) - - assert.Equal(suite.T(), 3, len(urlsList)) + return *configPath } -func (suite *TestSuite) TestWronglyFormatterUrls() { - - 
fileURL := "someURL" - - _, err := createFilePathFromURL(fileURL, "") - - assert.EqualError(suite.T(), err, "failed to parse url for downloading file") +func TestConfigTestSuite(t *testing.T) { + suite.Run(t, new(TestSuite)) } -func (suite *TestSuite) TestCorrectlyFormatterUrls() { - - fileURL := "https://some/base/A352744B-2CB4-4738-B6B5-BA55D25FB469/some/file.txt" - - _, err := createFilePathFromURL(fileURL, "") - assert.NoError(suite.T(), err) - - _, err = os.Stat("some") - assert.NoError(suite.T(), err) - - // Remove the folder created from the createFilePathFromURL function - _ = os.Remove("some") +func (suite *TestSuite) SetupTest() { + suite.accessToken = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6ImtleXN0b3JlLUNIQU5HRS1NRSJ9.eyJqdGkiOiJWTWpfNjhhcEMxR2FJbXRZdFExQ0ciLCJzdWIiOiJkdW1teSIsImlzcyI6Imh0dHA6Ly9vaWRjOjkwOTAiLCJpYXQiOjE3MDc3NjMyODksImV4cCI6MTg2NTU0NzkxOSwic2NvcGUiOiJvcGVuaWQgZ2E0Z2hfcGFzc3BvcnRfdjEgcHJvZmlsZSBlbWFpbCIsImF1ZCI6IlhDNTZFTDExeHgifQ.ZFfIAOGeM2I5cvqr1qJV74qU65appYjpNJVWevGHjGA5Xk_qoRMFJXmG6AiQnYdMKnJ58sYGNjWgs2_RGyw5NyM3-pgP7EKHdWU4PrDOU84Kosg4IPMSFxbBRAEjR5X04YX_CLYW2MFk_OyM9TIln522_JBVT_jA5WTTHSmBRHntVArYYHvQdF-oFRiqL8JXWlsUBh3tqQ33sZdqd9g64YhTk9a5lEC42gn5Hg9Hm_qvkl5orzEqIg7x9z5706IBE4Zypco5ohrAKsEbA8EKbEBb0jigGgCslQNde2owUyKIkvZYmxHA78X5xpymMp9K--PgbkyMS9GtA-YwOHPs-w" } -// Test that the get request doesn't return an error when the server returns 200 -func (suite *TestSuite) TestDownloadFile() { - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusOK) - })) - defer ts.Close() - - file := "somefile.c4gh" - err := downloadFile(ts.URL, file) - assert.NoError(suite.T(), err) +func (suite *TestSuite) TestInvalidUrl() { + confPath := createConfigFile("s3cmd.conf", suite.accessToken) + + os.Args = []string{ + "download", + "-dataset-id", + "TES01", + "-config", + confPath.Name(), + "-url", + "https://some/url", + "file1", + "file2", + } - // Remove the file created from the 
downloadFile function - _ = os.Remove(file) + err := Download(os.Args) + assert.Contains( + suite.T(), + err.Error(), + "failed to get files, reason: failed to get response, reason: Get \"https://some/url/metadata/datasets/TES01/files\": dial tcp: lookup some", + ) } -// Test that the get returns an error when response code is >=400 and that -// the error is parsed correctly when the S3 backend response is in xml -func (suite *TestSuite) TestdownloadFileErrorStatusCode() { - - file := "somefile.c4gh" - - // Case when the user tried to download from a private bucket - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusNotFound) - _, _ = io.WriteString(w, "\nNoSuchKeyThe specified key does not exist.A352764B-2KB4-4738-B6B5-BA55D25FB469download/download/A352764B-2KB4-4738-B6B5-BA55D25FB4691728F10EAA85663B73e4c710-46e8-4846-b70b-86ee905a3ab0") +func (suite *TestSuite) TestGetBody() { + // Create a test server + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + // Set the response status code + w.WriteHeader(http.StatusOK) + // Set the response body + fmt.Fprint(w, "test response") })) - defer ts.Close() + defer server.Close() - err := downloadFile(ts.URL, file) - assert.EqualError(suite.T(), err, "request failed with `404 Not Found`, details: {Code:NoSuchKey Message:The specified key does not exist. 
Resource:/download/A352764B-2KB4-4738-B6B5-BA55D25FB469}") + // Make a request to the test server with an empty public key + body, err := getBody(server.URL, "test-token", "") + if err != nil { + suite.T().Errorf("getBody returned an error: %v", err) + } - // Case when the user tried to download from a private bucket - ts = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusForbidden) - _, _ = io.WriteString(w, "\nAllAccessDisabledAll access to this bucket has been disabled./minio/test/dummy/data_file1.c4gh73e4c710-46e8-4846-b70b-86ee905a3ab0") - })) - defer ts.Close() + // Check the response body + expectedBody := "test response" + if string(body) != expectedBody { + suite.T(). + Errorf("getBody returned incorrect response body, got: %s, want: %s", string(body), expectedBody) + } - err = downloadFile(ts.URL, file) - assert.EqualError(suite.T(), err, "request failed with `403 Forbidden`, details: {Code:AllAccessDisabled Message:All access to this bucket has been disabled. Resource:/minio/test/dummy/data_file1.c4gh}") + // Make a request to the test server using a public key + body, err = getBody(server.URL, "test-token", "test-public-key") + if err != nil { + suite.T().Errorf("getBody returned an error: %v", err) + } - // Check that the downloadFile function did not create any file in case of error - msg := "stat somefile.c4gh: no such file or directory" - if runtime.GOOS == "windows" { - msg = "CreateFile somefile.c4gh: The system cannot find the file specified." + // Check the response body + expectedBody = "test response" + if string(body) != expectedBody { + suite.T(). 
+ Errorf("getBody returned incorrect response body, got: %s, want: %s", string(body), expectedBody) } - _, err = os.Stat(file) - assert.EqualError(suite.T(), err, msg) } -func (suite *TestSuite) TestCreateFilePath() { +func (suite *TestSuite) TestDownloadUrl() { + // Mock getBody function + defer func() { getResponseBody = getBody }() + getResponseBody = func(_, _, _ string) ([]byte, error) { + return []byte(`[ + { + "fileId": "file1id", + "datasetId": "TES01", + "displayName": "file1", + "filePath": "path/to/file1.c4gh", + "fileName": "4293c9a7-re60-46ac-b79a-40ddc0ddd1c6" + } + ]`), nil + } - fileName := "https://some/base/A352744B-2CB4-4738-B6B5-BA55D25FB469/some/file.txt" - baseDir := "one/directory" + baseURL := "https://some/url" + token := suite.accessToken + datasetID := "test-dataset" + filepath := "path/to/file1" + expectedURL := "https://some/url/files/file1id" - expect := filepath.Join("one", "directory", "some", "file.txt") - path, err := createFilePathFromURL(fileName, baseDir) - assert.NoError(suite.T(), err) - assert.Equal(suite.T(), expect, path) + //----------------------------------------------- + // Test with an empty public key - _, err = os.Stat(baseDir) + // Test with valid base_url, token, dataset, and filename + url, _, err := getFileIDURL(baseURL, token, "", datasetID, filepath) assert.NoError(suite.T(), err) + assert.Equal(suite.T(), expectedURL, url) - err = os.RemoveAll("one") + // Test with url as dataset + datasetID = "https://doi.example/another/url/001" + _, _, err = getFileIDURL(baseURL, token, "", datasetID, filepath) assert.NoError(suite.T(), err) -} + assert.Equal(suite.T(), expectedURL, url) -func (suite *TestSuite) TestGetURLsListFile() { + // Test with filename not in response + filepath = "path/to/file2" + _, _, err = getFileIDURL(baseURL, token, "", datasetID, filepath) + assert.Error(suite.T(), err) - currentPath, err := os.Getwd() + // Test with fileID + filepath = "file1id" + _, _, err = getFileIDURL(baseURL, token, "", 
datasetID, filepath) assert.NoError(suite.T(), err) - // Folder URL does not exist - fileLocation := "https://some/base/A352744B-2CB4-4738-B6B5-BA55D25FB469/some/" + // Test with bad URL + _, _, err = getFileIDURL("some/url", token, "", datasetID, filepath) + assert.Error(suite.T(), err) - urlsFilePath, err := GetURLsListFile(currentPath, fileLocation) - assert.Equal(suite.T(), urlsFilePath, "") - // The error differs locally and in the repo, therefore checking that error starts - // with the specified phrase instead of the whole message - assert.True(suite.T(), strings.HasPrefix(err.Error(), "failed to download file, reason:")) + //----------------------------------------------- + // Test using a nonempty public key + // Test with valid base_url, token, dataset, and filename + expectedURL = baseURL + "/s3-encrypted/" + datasetID + "/" + filepath + pubKey := "test-public-key" + url, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, filepath) + assert.NoError(suite.T(), err) + assert.Equal(suite.T(), expectedURL, url) - // File URL does not exist - fileLocation = "https://some/base/A352744B-2CB4-4738-B6B5-BA55D25FB469/some/urls_list.txt" + // Test with url as dataset + datasetID = "https://doi.example/another/url/001" + expectedURL = baseURL + "/s3-encrypted/" + datasetID + "/" + filepath + url, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, filepath) + assert.NoError(suite.T(), err) + assert.Equal(suite.T(), expectedURL, url) - urlsFilePath, err = GetURLsListFile(currentPath, fileLocation) - assert.Equal(suite.T(), urlsFilePath, "") - // The error differs locally and in the repo, therefore checking that error starts - // with the specified phrase instead of the whole message - assert.True(suite.T(), strings.HasPrefix(err.Error(), "failed to download file, reason:")) + // Test with filename not in response + filepath = "path/to/file2" + _, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, filepath) + assert.Error(suite.T(), err) - // File 
path - fileLocation = "some/path/to/urls_list.txt" - urlsFilePath, err = GetURLsListFile(currentPath, fileLocation) - assert.Equal(suite.T(), urlsFilePath, fileLocation) - assert.NoError(suite.T(), err) + // Test with bad URL + _, _, err = getFileIDURL("some/url", token, pubKey, datasetID, filepath) + assert.Error(suite.T(), err) } -func (suite *TestSuite) TestGetURLsListFilePass() { - urlsList := `http://url/to/file1.c4gh -http://url/to/file2.c4gh -http://url/to/file3.c4gh -` +func (suite *TestSuite) TestDownloadFile() { + // Create a temporary directory for testing + tempDir := suite.T().TempDir() + + // Create a temporary file for testing + tempFile := filepath.Join(tempDir, "dummy-file.txt") + err := os.WriteFile(tempFile, []byte("test content"), 0600) + require.NoError(suite.T(), err) - ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - _, err := w.Write([]byte(urlsList)) - assert.NoError(suite.T(), err) + // Create a test server + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + // Set the response status code + w.WriteHeader(http.StatusOK) + // Set the response body + fmt.Fprint(w, "dummy response") })) - defer ts.Close() + defer server.Close() - file, err := os.Getwd() - if err != nil { - log.Printf("failed to get current directory, %v", err) - } + // Call the downloadFile function without a public key + err = downloadFile(server.URL, "test-token", "", tempFile) + require.NoError(suite.T(), err) - // Testing with url containing the file - fileLocation := ts.URL + "/A352744B-2CB4-4738-B6B5-BA55D25FB469/some/urls_list.txt" - urlsFilePath, err := GetURLsListFile(file, fileLocation) - assert.NoError(suite.T(), err) - // Check that the file exists - _, err = os.Stat(urlsFilePath) - assert.NoError(suite.T(), err) + // Read the downloaded file + downloadedContent, err := os.ReadFile(tempFile) + require.NoError(suite.T(), err) - // Check that the file contains the correct urls - 
expectedUrls, err := os.ReadFile(urlsFilePath) - assert.NoError(suite.T(), err) - assert.Equal(suite.T(), expectedUrls, []byte(urlsList)) + // Check if the downloaded content matches the expected content + expectedContent := "dummy response" + assert.Equal(suite.T(), expectedContent, string(downloadedContent)) - // Remove the file created from the downloadFile function - _ = os.Remove(urlsFilePath) + // Call the downloadFile function with a public key + err = downloadFile(server.URL, "test-token", "test-public-key", tempFile) + require.NoError(suite.T(), err) - // Testing with the URL containing the file folder - fileLocation = ts.URL + "/A352744B-2CB4-4738-B6B5-BA55D25FB469/some/" - urlsFilePath, err = GetURLsListFile(file, fileLocation) - assert.NoError(suite.T(), err) + // Read the downloaded file + downloadedContent, err = os.ReadFile(tempFile) + require.NoError(suite.T(), err) - // Check that the file exists - _, err = os.Stat(urlsFilePath) - assert.NoError(suite.T(), err) + // Check if the downloaded content matches the expected content + expectedContent = "dummy response" + assert.Equal(suite.T(), expectedContent, string(downloadedContent)) +} - // Check that the file contains the correct urls - expectedUrls, err = os.ReadFile(urlsFilePath) - assert.NoError(suite.T(), err) - assert.Equal(suite.T(), expectedUrls, []byte(urlsList)) +func (suite *TestSuite) TestGetFilesInfo() { + // Mock getBody function + defer func() { getResponseBody = getBody }() + getResponseBody = func(_, _, _ string) ([]byte, error) { + return []byte(`[ + { + "fileId": "file1id", + "datasetId": "TES01", + "displayFileName": "file1", + "filePath": "path/to/file1", + "fileName": "4293c9a7-re60-46ac-b79a-40ddc0ddd1c6" + }, + { + "fileId": "file2id", + "datasetId": "TES01", + "displayFileName": "file2", + "filePath": "path/to/file2", + "fileName": "4b40bd16-9eba-4992-af39-a7f824e612e2" + } + ]`), nil + } + + // Test + token := suite.accessToken + baseURL := "https://some/url" + datasetID := 
"test-dataset" + files, err := GetFilesInfo(baseURL, datasetID, "", token) + require.NoError(suite.T(), err) + require.Len(suite.T(), files, 2) + assert.Equal(suite.T(), "file1id", files[0].FileID) + assert.Equal(suite.T(), "file1", files[0].DisplayFileName) + assert.Equal(suite.T(), "path/to/file1", files[0].FilePath) + assert.Equal(suite.T(), "4293c9a7-re60-46ac-b79a-40ddc0ddd1c6", files[0].FileName) + assert.Equal(suite.T(), "TES01", files[0].DatasetID) + assert.Equal(suite.T(), "file2id", files[1].FileID) + assert.Equal(suite.T(), "file2", files[1].DisplayFileName) + assert.Equal(suite.T(), "path/to/file2", files[1].FilePath) + assert.Equal(suite.T(), "4b40bd16-9eba-4992-af39-a7f824e612e2", files[1].FileName) + assert.Equal(suite.T(), "TES01", files[1].DatasetID) +} - // Remove the file created from the downloadFile function - _ = os.Remove(urlsFilePath) +func (suite *TestSuite) TestGetDatasets() { + // Mock getBody function + defer func() { getResponseBody = getBody }() + getResponseBody = func(_, _, _ string) ([]byte, error) { + return []byte(`["https://doi.example/ty009.sfrrss/600.45asasga"]`), nil + } + // Test + token := suite.accessToken + baseURL := "https://some/url" + datasets, err := GetDatasets(baseURL, token) + require.NoError(suite.T(), err) + // assert.Contains(suite.T(), datasets, "https://doi.example/ty009.sfrrss/600.45asasga") + assert.Equal(suite.T(), datasets, []string{"https://doi.example/ty009.sfrrss/600.45asasga"}) } diff --git a/list/list.go b/list/list.go index 85b84293..c284a763 100644 --- a/list/list.go +++ b/list/list.go @@ -6,8 +6,8 @@ import ( "strings" + "github.com/NBISweden/sda-cli/download" "github.com/NBISweden/sda-cli/helpers" - sdaDownload "github.com/NBISweden/sda-cli/sda_download" "github.com/inhies/go-bytesize" ) @@ -104,7 +104,7 @@ func List(args []string) error { } func DatasetFiles(token string) error { - files, err := sdaDownload.GetFilesInfo(*URL, *dataset, "", token) + files, err := download.GetFilesInfo(*URL, 
*dataset, "", token) if err != nil { return err } @@ -117,7 +117,7 @@ func DatasetFiles(token string) error { } func Datasets(token string) error { - datasets, err := sdaDownload.GetDatasets(*URL, token) + datasets, err := download.GetDatasets(*URL, token) if err != nil { return err } diff --git a/main.go b/main.go index 6e3d9286..dff4c4ae 100644 --- a/main.go +++ b/main.go @@ -6,7 +6,6 @@ import ( "os" createKey "github.com/NBISweden/sda-cli/create_key" - "github.com/NBISweden/sda-cli/datasetsize" "github.com/NBISweden/sda-cli/decrypt" "github.com/NBISweden/sda-cli/download" "github.com/NBISweden/sda-cli/encrypt" @@ -14,7 +13,6 @@ import ( "github.com/NBISweden/sda-cli/htsget" "github.com/NBISweden/sda-cli/list" "github.com/NBISweden/sda-cli/login" - sdaDownload "github.com/NBISweden/sda-cli/sda_download" "github.com/NBISweden/sda-cli/upload" "github.com/NBISweden/sda-cli/version" log "github.com/sirupsen/logrus" @@ -36,17 +34,15 @@ type commandInfo struct { } var Commands = map[string]commandInfo{ - "encrypt": {encrypt.Args, encrypt.Usage, encrypt.ArgHelp}, - "createKey": {createKey.Args, createKey.Usage, createKey.ArgHelp}, - "decrypt": {decrypt.Args, decrypt.Usage, decrypt.ArgHelp}, - "download": {download.Args, download.Usage, download.ArgHelp}, - "upload": {upload.Args, upload.Usage, upload.ArgHelp}, - "datasetsize": {datasetsize.Args, datasetsize.Usage, datasetsize.ArgHelp}, - "list": {list.Args, list.Usage, list.ArgHelp}, - "htsget": {htsget.Args, htsget.Usage, htsget.ArgHelp}, - "login": {login.Args, login.Usage, login.ArgHelp}, - "sda-download": {sdaDownload.Args, sdaDownload.Usage, sdaDownload.ArgHelp}, - "version": {version.Args, version.Usage, version.ArgHelp}, + "encrypt": {encrypt.Args, encrypt.Usage, encrypt.ArgHelp}, + "createKey": {createKey.Args, createKey.Usage, createKey.ArgHelp}, + "decrypt": {decrypt.Args, decrypt.Usage, decrypt.ArgHelp}, + "upload": {upload.Args, upload.Usage, upload.ArgHelp}, + "list": {list.Args, list.Usage, list.ArgHelp}, 
+ "htsget": {htsget.Args, htsget.Usage, htsget.ArgHelp}, + "login": {login.Args, login.Usage, login.ArgHelp}, + "download": {download.Args, download.Usage, download.ArgHelp}, + "version": {version.Args, version.Usage, version.ArgHelp}, } // Main does argument parsing, then delegates to one of the sub modules @@ -64,20 +60,16 @@ func main() { err = createKey.CreateKey(args) case "decrypt": err = decrypt.Decrypt(args) - case "download": - err = download.Download(args) case "upload": err = upload.Upload(args) - case "datasetsize": - err = datasetsize.DatasetSize(args) case "list": err = list.List(args) case "htsget": err = htsget.Htsget(args) case "login": err = login.NewLogin(args) - case "sda-download": - err = sdaDownload.SdaDownload(args) + case "download": + err = download.Download(args) case "version": err = version.Version(Version) default: diff --git a/sda_download/sda_download.go b/sda_download/sda_download.go deleted file mode 100644 index c22bfb7b..00000000 --- a/sda_download/sda_download.go +++ /dev/null @@ -1,490 +0,0 @@ -package sdadownload - -import ( - "encoding/base64" - "encoding/json" - "errors" - "flag" - "fmt" - "io" - "net/http" - "net/mail" - "net/url" - "os" - "path/filepath" - "slices" - "strings" - - s3Download "github.com/NBISweden/sda-cli/download" - "github.com/NBISweden/sda-cli/helpers" - "github.com/vbauerster/mpb/v8" - "github.com/vbauerster/mpb/v8/decor" -) - -// Help text and command line flags. - -// Usage text that will be displayed as command line help text when using the -// `help download` command -var Usage = ` -USAGE: %s sda-download -config -dataset-id -url (--pubkey ) (-outdir ) ([filepath(s) or fileid(s)] or --dataset or --recursive ) or --from-file - -sda-download: - Downloads files from the Sensitive Data Archive (SDA) by using APIs from the given url. The user - must have been granted access to the datasets (visas) that are to be downloaded. 
- The files will be downloaded in the current directory, if outdir is not defined. - When the -pubkey flag is used, the downloaded files will be server-side encrypted with the given public key. - If the --dataset flag is used, all files in the dataset will be downloaded. - If the --recursive flag is used, all files in the directory will be downloaded. - If the --from-file flag is used, all the files that are in the file will be downloaded. - ` - -// ArgHelp is the suffix text that will be displayed after the argument list in -// the module help -var ArgHelp = ` - [datasetID] - The ID of the dataset that the file is part of. - [uri] - All flagless arguments will be used as sda-download uri. - [filepath(s)] - The filepath of the file to download. - [fileid(s)] - The file ID of the file to download. - [dirpath] - The directory path to download all files recursively. - [list-filepath] - The path to the file that contains the list of files to download.` - -// Args is a flagset that needs to be exported so that it can be written to the -// main program help -var Args = flag.NewFlagSet("sda-download", flag.ExitOnError) - -var configPath = Args.String("config", "", "S3 config file to use for downloading.") - -var datasetID = Args.String("dataset-id", "", "Dataset ID for the file to download.") - -var URL = Args.String("url", "", "The url of the sda-download server.") - -var outDir = Args.String("outdir", "", "Directory for downloaded files.") - -var datasetdownload = Args.Bool("dataset", false, "Download all the files of the dataset.") - -var pubKeyPath = Args.String("pubkey", "", - "Public key file to use for encryption of files to download.") - -var recursiveDownload = Args.Bool("recursive", false, "Download content of the folder.") - -var fromFile = Args.Bool("from-file", false, "Download files from file list.") - -// necessary for mocking in testing -var getResponseBody = getBody - -// File struct represents the file metadata -type File struct { - FileID string 
`json:"fileId"` - DatasetID string `json:"datasetId"` - DisplayFileName string `json:"displayFileName"` - FilePath string `json:"filePath"` - FileName string `json:"fileName"` - FileSize int `json:"fileSize"` - DecryptedFileSize int `json:"decryptedFileSize"` - DecryptedFileChecksum string `json:"decryptedFileChecksum"` - DecryptedFileChecksumType string `json:"decryptedFileChecksumType"` - FileStatus string `json:"fileStatus"` - CreatedAt string `json:"createdAt"` - LastModified string `json:"lastModified"` -} - -// SdaDownload function downloads files from the SDA by using the -// download's service APIs -func SdaDownload(args []string) error { - // Call ParseArgs to take care of all the flag parsing - err := helpers.ParseArgs(args, Args) - if err != nil { - return fmt.Errorf("failed parsing arguments, reason: %v", err) - } - - if *datasetID == "" || *URL == "" || *configPath == "" { - return fmt.Errorf("missing required arguments, dataset, config and url are required") - } - - // Check if both --recursive and --dataset flags are set - if *recursiveDownload && *datasetdownload { - return fmt.Errorf("both --recursive and --dataset flags are set, choose one of them") - } - - // Check that file(s) are not missing if the --dataset flag is not set - if len(Args.Args()) == 0 && !*datasetdownload { - if !*recursiveDownload { - return fmt.Errorf("no files provided for download") - } - - return fmt.Errorf("no folders provided for recursive download") - } - - // Check if --dataset flag is set and files are provided - if *datasetdownload && len(Args.Args()) > 0 { - return fmt.Errorf( - "files provided with --dataset flag, add either the flag or the file(s), not both", - ) - } - - // Check if --from-file flag is set and only one file is provided - if *fromFile && len(Args.Args()) != 1 { - return fmt.Errorf( - "one file should be provided with --from-file flag", - ) - } - - // Get the configuration file or the .sda-cli-session - config, err := helpers.GetAuth(*configPath) - 
if err != nil { - return err - } - - // Check if the token has expired - err = helpers.CheckTokenExpiration(config.AccessToken) - if err != nil { - return err - } - - switch { - // Case where the user is setting the --dataset flag - // then download all the files in the dataset. - // Case where the user is setting the --recursive flag - // then download the content of the path - // Case where the user is setting the --from-file flag - // then download the files from the file list - // Default case, download the provided files. - case *datasetdownload: - err = datasetCase(config.AccessToken) - if err != nil { - return err - } - case *recursiveDownload: - err = recursiveCase(config.AccessToken) - if err != nil { - return err - } - case *fromFile: - err = fileCase(config.AccessToken, true) - if err != nil { - return err - } - default: - err = fileCase(config.AccessToken, false) - if err != nil { - return err - } - } - - return nil -} - -func datasetCase(token string) error { - fmt.Println("Downloading all files in the dataset") - files, err := GetFilesInfo(*URL, *datasetID, "", token) - if err != nil { - return err - } - // Loop through the files and download them - for _, file := range files { - // Download URL for the file - fileURL := *URL + "/files/" + file.FileID - err = downloadFile(fileURL, token, "", file.FilePath) - if err != nil { - return err - } - } - - return nil -} - -func recursiveCase(token string) error { - fmt.Println("Downloading content of the path(s)") - // get all the files of the dataset - files, err := GetFilesInfo(*URL, *datasetID, "", token) - if err != nil { - return err - } - // check all the provided paths and add a slash - // to each one of them if does not exist and - // append them in a slice - var dirPaths []string - for _, path := range Args.Args() { - if !strings.HasSuffix(path, "/") { - path += "/" - } - dirPaths = append(dirPaths, path) - } - var missingPaths []string - // Loop over all the files of the dataset and - // check if 
the provided path is part of their filepath. - // If it is then download the file - for _, dirPath := range dirPaths { - pathExists := false - for _, file := range files { - if strings.Contains(file.FilePath, dirPath) { - pathExists = true - fileURL := *URL + "/files/" + file.FileID - err = downloadFile(fileURL, token, "", file.FilePath) - if err != nil { - return err - } - } - } - // If dirPath does not exist add in the list - if !pathExists { - missingPaths = append(missingPaths, dirPath) - } - } - // If all the given paths do not exist then return an error - if len(missingPaths) == len(dirPaths) { - return errors.New("given path(s) do not exist") - } - // If some of the give paths do not exist then just return a message - if len(missingPaths) > 0 { - for _, missingPath := range missingPaths { - fmt.Println("Non existing path: ", missingPath) - } - } - - return nil -} - -func fileCase(token string, fileList bool) error { - var files []string - if fileList { - // get the files from the file list - fmt.Println("Downloading files from file list") - fileList, err := s3Download.GetURLsFile(Args.Args()[0]) - if err != nil { - return err - } - files = append(files, fileList...) - } else { - // get the files from the arguments - fmt.Println("Downloading files") - files = append(files, Args.Args()...) 
- } - - *pubKeyPath = strings.TrimSpace(*pubKeyPath) - var pubKeyBase64 string - if *pubKeyPath != "" { - // Read the public key - pubKey, err := os.ReadFile(*pubKeyPath) - if err != nil { - return fmt.Errorf("failed to read public key, reason: %v", err) - } - pubKeyBase64 = base64.StdEncoding.EncodeToString(pubKey) - } - - // Loop through the files and download them - for _, filePath := range files { - fileIDURL, apiFilePath, err := getFileIDURL(*URL, token, pubKeyBase64, *datasetID, filePath) - if err != nil { - return err - } - - err = downloadFile(fileIDURL, token, pubKeyBase64, apiFilePath) - if err != nil { - return err - } - } - - return nil -} - -// downloadFile downloads the file by using the download URL -func downloadFile(uri, token, pubKeyBase64, filePath string) error { - // Check if the file path contains a userID and if it does, - // do not keep it in the file path - filePathSplit := strings.Split(filePath, "/") - if strings.Contains(filePathSplit[0], "_") { - _, err := mail.ParseAddress(strings.ReplaceAll(filePathSplit[0], "_", "@")) - if err == nil { - filePath = strings.Join(filePathSplit[1:], "/") - } - } - - outFilename := filePath - if *outDir != "" { - outFilename = *outDir + "/" + filePath - } - - filePath = strings.TrimSuffix(outFilename, ".c4gh") - - // Get the file body - body, err := getResponseBody(uri, token, pubKeyBase64) - if err != nil { - return fmt.Errorf("failed to get file for download, reason: %v", err) - } - - // Create the directory if it does not exist - fileDir := filepath.Dir(filePath) - err = os.MkdirAll(fileDir, os.ModePerm) - if err != nil { - return fmt.Errorf("failed to create directory, reason: %v", err) - } - - if pubKeyBase64 != "" { - filePath += ".c4gh" - } - outfile, err := os.Create(filePath) - if err != nil { - return fmt.Errorf("failed to create file, reason: %v", err) - } - defer outfile.Close() - - // Create a new progress container - p := mpb.New() - - // Create a new progress bar with the length of the 
body - bar := p.AddBar(int64(len(body)), - mpb.PrependDecorators( - decor.CountersKibiByte("% .2f / % .2f"), - ), - ) - - // Create a proxy reader - reader := strings.NewReader(string(body)) - proxyReader := bar.ProxyReader(reader) - - fmt.Printf("Downloading file to %s\n", filePath) - // Copy from the proxy reader (which updates the progress bar) to the file - _, err = io.Copy(outfile, proxyReader) - if err != nil { - return fmt.Errorf("failed to write file, reason: %v", err) - } - - // Wait for the progress bar to finish - p.Wait() - - return nil -} - -// getFileIDURL gets the datset files, parses the JSON response to get the file ID -// and returns the download URL for the file and the filepath from the API response -func getFileIDURL(baseURL, token, pubKeyBase64, dataset, filename string) (string, string, error) { - // Get the files of the dataset - datasetFiles, err := GetFilesInfo(baseURL, dataset, pubKeyBase64, token) - if err != nil { - return "", "", err - } - - // Get the file ID for the filename - var idx int - switch { - case strings.Contains(filename, "/"): - // If filename does not have a crypt4gh suffix, add one - if !strings.HasSuffix(filename, ".c4gh") { - filename += ".c4gh" - } - idx = slices.IndexFunc( - datasetFiles, - func(f File) bool { return strings.Contains(f.FilePath, filename) }, - ) - default: - idx = slices.IndexFunc( - datasetFiles, - func(f File) bool { return strings.Contains(f.FileID, filename) }, - ) - } - - if idx == -1 { - return "", "", fmt.Errorf("File not found in dataset %s", filename) - } - - var url string - // If no public key is provided, retrieve the unencrypted file - if pubKeyBase64 == "" { - url = baseURL + "/files/" + datasetFiles[idx].FileID - } else { - url = baseURL + "/s3-encrypted/" + dataset + "/" + filename - } - - return url, datasetFiles[idx].FilePath, nil -} - -func GetDatasets(baseURL, token string) ([]string, error) { - // Sanitize the base_url - u, err := url.ParseRequestURI(baseURL) - if err != nil || 
u.Scheme == "" { - return []string{}, fmt.Errorf("invalid base URL") - } - // Make the url for listing datasets - datasetsURL := baseURL + "/metadata/datasets" - // Get the response body from the datasets API - allDatasets, err := getResponseBody(datasetsURL, token, "") - if err != nil { - return []string{}, fmt.Errorf("failed to get datasets, reason: %v", err) - } - // Parse the JSON response - var datasets []string - err = json.Unmarshal(allDatasets, &datasets) - if err != nil { - return []string{}, fmt.Errorf("failed to parse dataset list JSON, reason: %v", err) - } - - return datasets, nil -} - -// GetFilesInfo gets the files of the dataset by using the dataset ID -func GetFilesInfo(baseURL, dataset, pubKeyBase64, token string) ([]File, error) { - // Sanitize the base_url - u, err := url.ParseRequestURI(baseURL) - if err != nil || u.Scheme == "" { - return []File{}, fmt.Errorf("invalid base URL") - } - // Make the url for listing files - filesURL := baseURL + "/metadata/datasets/" + dataset + "/files" - // Get the response body from the files API - allFiles, err := getResponseBody(filesURL, token, pubKeyBase64) - if err != nil { - return []File{}, fmt.Errorf("failed to get files, reason: %v", err) - } - // Parse the JSON response - var files []File - err = json.Unmarshal(allFiles, &files) - if err != nil { - return []File{}, fmt.Errorf("failed to parse file list JSON, reason: %v", err) - } - - return files, nil -} - -// getBody gets the body of the response from the URL -func getBody(url, token, pubKeyBase64 string) ([]byte, error) { - req, err := http.NewRequest("GET", url, nil) - if err != nil { - return nil, fmt.Errorf("failed to create request, reason: %v", err) - } - - // Add headers - req.Header.Add("Authorization", "Bearer "+token) - req.Header.Add("Content-Type", "application/json") - if pubKeyBase64 != "" { - req.Header.Add("Client-Public-Key", pubKeyBase64) - } - - // Send the request - client := &http.Client{} - res, err := client.Do(req) - if err != 
nil { - return nil, fmt.Errorf("failed to get response, reason: %v", err) - } - - // Check the status code - if res.StatusCode != http.StatusOK { - return nil, fmt.Errorf("server returned status %d", res.StatusCode) - } - - // Read the response body - resBody, err := io.ReadAll(res.Body) - if err != nil { - return nil, fmt.Errorf("failed to read response body, reason: %v", err) - } - - defer res.Body.Close() - - return resBody, nil -} diff --git a/sda_download/sda_download_test.go b/sda_download/sda_download_test.go deleted file mode 100644 index 1bae150f..00000000 --- a/sda_download/sda_download_test.go +++ /dev/null @@ -1,297 +0,0 @@ -package sdadownload - -import ( - "fmt" - "net/http" - "net/http/httptest" - "os" - "path/filepath" - "testing" - - log "github.com/sirupsen/logrus" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "github.com/stretchr/testify/suite" -) - -type TestSuite struct { - suite.Suite - accessToken string -} - -func createConfigFile(fileName, token string) os.File { - // Create conf file for sda-cli - confFile := fmt.Sprintf(` - access_token = %[1]s - host_base = inbox.dummy.org - encoding = UTF-8 - host_bucket = inbox.dummy.org - multipart_chunk_size_mb = 50 - secret_key = dummy - access_key = dummy - use_https = False - check_ssl_certificate = False - check_ssl_hostname = False - socket_timeout = 30 - human_readable_sizes = True - guess_mime_type = True - encrypt = False - `, token) - - // Create config file - configPath, err := os.CreateTemp(os.TempDir(), fileName) - if err != nil { - log.Panic(err) - } - - // Write config file - err = os.WriteFile(configPath.Name(), []byte(confFile), 0600) - if err != nil { - log.Printf("failed to write temp config file, %v", err) - } - - return *configPath -} - -func TestConfigTestSuite(t *testing.T) { - suite.Run(t, new(TestSuite)) -} - -func (suite *TestSuite) SetupTest() { - suite.accessToken = 
"eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6ImtleXN0b3JlLUNIQU5HRS1NRSJ9.eyJqdGkiOiJWTWpfNjhhcEMxR2FJbXRZdFExQ0ciLCJzdWIiOiJkdW1teSIsImlzcyI6Imh0dHA6Ly9vaWRjOjkwOTAiLCJpYXQiOjE3MDc3NjMyODksImV4cCI6MTg2NTU0NzkxOSwic2NvcGUiOiJvcGVuaWQgZ2E0Z2hfcGFzc3BvcnRfdjEgcHJvZmlsZSBlbWFpbCIsImF1ZCI6IlhDNTZFTDExeHgifQ.ZFfIAOGeM2I5cvqr1qJV74qU65appYjpNJVWevGHjGA5Xk_qoRMFJXmG6AiQnYdMKnJ58sYGNjWgs2_RGyw5NyM3-pgP7EKHdWU4PrDOU84Kosg4IPMSFxbBRAEjR5X04YX_CLYW2MFk_OyM9TIln522_JBVT_jA5WTTHSmBRHntVArYYHvQdF-oFRiqL8JXWlsUBh3tqQ33sZdqd9g64YhTk9a5lEC42gn5Hg9Hm_qvkl5orzEqIg7x9z5706IBE4Zypco5ohrAKsEbA8EKbEBb0jigGgCslQNde2owUyKIkvZYmxHA78X5xpymMp9K--PgbkyMS9GtA-YwOHPs-w" -} - -func (suite *TestSuite) TestInvalidUrl() { - confPath := createConfigFile("s3cmd.conf", suite.accessToken) - - os.Args = []string{ - "sda-download", - "-dataset-id", - "TES01", - "-config", - confPath.Name(), - "-url", - "https://some/url", - "file1", - "file2", - } - - err := SdaDownload(os.Args) - assert.Contains( - suite.T(), - err.Error(), - "failed to get files, reason: failed to get response, reason: Get \"https://some/url/metadata/datasets/TES01/files\": dial tcp: lookup some", - ) -} - -func (suite *TestSuite) TestGetBody() { - // Create a test server - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - // Set the response status code - w.WriteHeader(http.StatusOK) - // Set the response body - fmt.Fprint(w, "test response") - })) - defer server.Close() - - // Make a request to the test server with an empty public key - body, err := getBody(server.URL, "test-token", "") - if err != nil { - suite.T().Errorf("getBody returned an error: %v", err) - } - - // Check the response body - expectedBody := "test response" - if string(body) != expectedBody { - suite.T(). 
- Errorf("getBody returned incorrect response body, got: %s, want: %s", string(body), expectedBody) - } - - // Make a request to the test server using a public key - body, err = getBody(server.URL, "test-token", "test-public-key") - if err != nil { - suite.T().Errorf("getBody returned an error: %v", err) - } - - // Check the response body - expectedBody = "test response" - if string(body) != expectedBody { - suite.T(). - Errorf("getBody returned incorrect response body, got: %s, want: %s", string(body), expectedBody) - } -} - -func (suite *TestSuite) TestDownloadUrl() { - // Mock getBody function - defer func() { getResponseBody = getBody }() - getResponseBody = func(_, _, _ string) ([]byte, error) { - return []byte(`[ - { - "fileId": "file1id", - "datasetId": "TES01", - "displayName": "file1", - "filePath": "path/to/file1.c4gh", - "fileName": "4293c9a7-re60-46ac-b79a-40ddc0ddd1c6" - } - ]`), nil - } - - baseURL := "https://some/url" - token := suite.accessToken - datasetID := "test-dataset" - filepath := "path/to/file1" - expectedURL := "https://some/url/files/file1id" - - //----------------------------------------------- - // Test with an empty public key - - // Test with valid base_url, token, dataset, and filename - url, _, err := getFileIDURL(baseURL, token, "", datasetID, filepath) - assert.NoError(suite.T(), err) - assert.Equal(suite.T(), expectedURL, url) - - // Test with url as dataset - datasetID = "https://doi.example/another/url/001" - _, _, err = getFileIDURL(baseURL, token, "", datasetID, filepath) - assert.NoError(suite.T(), err) - assert.Equal(suite.T(), expectedURL, url) - - // Test with filename not in response - filepath = "path/to/file2" - _, _, err = getFileIDURL(baseURL, token, "", datasetID, filepath) - assert.Error(suite.T(), err) - - // Test with fileID - filepath = "file1id" - _, _, err = getFileIDURL(baseURL, token, "", datasetID, filepath) - assert.NoError(suite.T(), err) - - // Testr with bad URL - _, _, err = getFileIDURL("some/url", 
token, "", datasetID, filepath) - assert.Error(suite.T(), err) - - //----------------------------------------------- - // Test using a nonempty public key - // Test with valid base_url, token, dataset, and filename - expectedURL = baseURL + "/s3-encrypted/" + datasetID + "/" + filepath - pubKey := "test-public-key" - url, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, filepath) - assert.NoError(suite.T(), err) - assert.Equal(suite.T(), expectedURL, url) - - // Test with url as dataset - datasetID = "https://doi.example/another/url/001" - expectedURL = baseURL + "/s3-encrypted/" + datasetID + "/" + filepath - url, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, filepath) - assert.NoError(suite.T(), err) - assert.Equal(suite.T(), expectedURL, url) - - // Test with filename not in response - filepath = "path/to/file2" - _, _, err = getFileIDURL(baseURL, token, pubKey, datasetID, filepath) - assert.Error(suite.T(), err) - - // Testr with bad URL - _, _, err = getFileIDURL("some/url", token, pubKey, datasetID, filepath) - assert.Error(suite.T(), err) -} - -func (suite *TestSuite) TestDownloadFile() { - // Create a temporary directory for testing - tempDir := suite.T().TempDir() - - // Create a temporary file for testing - tempFile := filepath.Join(tempDir, "dummy-file.txt") - err := os.WriteFile(tempFile, []byte("test content"), 0600) - require.NoError(suite.T(), err) - - // Create a test server - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - // Set the response status code - w.WriteHeader(http.StatusOK) - // Set the response body - fmt.Fprint(w, "dummy response") - })) - defer server.Close() - - // Call the downloadFile function without a public key - err = downloadFile(server.URL, "test-token", "", tempFile) - require.NoError(suite.T(), err) - - // Read the downloaded file - downloadedContent, err := os.ReadFile(tempFile) - require.NoError(suite.T(), err) - - // Check if the downloaded content matches 
the expected content - expectedContent := "dummy response" - assert.Equal(suite.T(), expectedContent, string(downloadedContent)) - - // Call the downloadFile function with a public key - err = downloadFile(server.URL, "test-token", "test-public-key", tempFile) - require.NoError(suite.T(), err) - - // Read the downloaded file - downloadedContent, err = os.ReadFile(tempFile) - require.NoError(suite.T(), err) - - // Check if the downloaded content matches the expected content - expectedContent = "dummy response" - assert.Equal(suite.T(), expectedContent, string(downloadedContent)) -} - -func (suite *TestSuite) TestGetFilesInfo() { - // Mock getBody function - defer func() { getResponseBody = getBody }() - getResponseBody = func(_, _, _ string) ([]byte, error) { - return []byte(`[ - { - "fileId": "file1id", - "datasetId": "TES01", - "displayFileName": "file1", - "filePath": "path/to/file1", - "fileName": "4293c9a7-re60-46ac-b79a-40ddc0ddd1c6" - }, - { - "fileId": "file2id", - "datasetId": "TES01", - "displayFileName": "file2", - "filePath": "path/to/file2", - "fileName": "4b40bd16-9eba-4992-af39-a7f824e612e2" - } - ]`), nil - } - - // Test - token := suite.accessToken - baseURL := "https://some/url" - datasetID := "test-dataset" - files, err := GetFilesInfo(baseURL, datasetID, "", token) - require.NoError(suite.T(), err) - require.Len(suite.T(), files, 2) - assert.Equal(suite.T(), "file1id", files[0].FileID) - assert.Equal(suite.T(), "file1", files[0].DisplayFileName) - assert.Equal(suite.T(), "path/to/file1", files[0].FilePath) - assert.Equal(suite.T(), "4293c9a7-re60-46ac-b79a-40ddc0ddd1c6", files[0].FileName) - assert.Equal(suite.T(), "TES01", files[0].DatasetID) - assert.Equal(suite.T(), "file2id", files[1].FileID) - assert.Equal(suite.T(), "file2", files[1].DisplayFileName) - assert.Equal(suite.T(), "path/to/file2", files[1].FilePath) - assert.Equal(suite.T(), "4b40bd16-9eba-4992-af39-a7f824e612e2", files[1].FileName) - assert.Equal(suite.T(), "TES01", 
files[1].DatasetID) -} - -func (suite *TestSuite) TestGetDatasets() { - // Mock getBody function - defer func() { getResponseBody = getBody }() - getResponseBody = func(_, _, _ string) ([]byte, error) { - return []byte(`["https://doi.example/ty009.sfrrss/600.45asasga"]`), nil - } - - // Test - token := suite.accessToken - baseURL := "https://some/url" - datasets, err := GetDatasets(baseURL, token) - require.NoError(suite.T(), err) - // assert.Contains(suite.T(), datasets, "https://doi.example/ty009.sfrrss/600.45asasga") - assert.Equal(suite.T(), datasets, []string{"https://doi.example/ty009.sfrrss/600.45asasga"}) -}