Skip to content

Commit

Permalink
Merge branch 'users/moderakh/spark3-merging-to-master'
Browse files Browse the repository at this point in the history
  • Loading branch information
moderakh committed Mar 29, 2021
2 parents 5708e10 + f7a5dba commit 852122a
Show file tree
Hide file tree
Showing 152 changed files with 13,982 additions and 158 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ venv
nbproject
nb-configuration.xml

# Scala Stylecheck
scalastyle-output.xml

# Emacs #

#changebundle.txt#
307 changes: 306 additions & 1 deletion NOTICE.txt

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions eng/.docsettings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,12 @@ known_content_issues:
- ['sdk/cosmos/azure-cosmos-examples/README.md', '#3113']
- ['sdk/cosmos/azure-cosmos/README.md', '#3113']
- ['sdk/cosmos/azure-cosmos-encryption/README.md', '#3113']
- ['sdk/cosmos/azure-cosmos-spark_3-1_2-12/README.md', '#3113']
- ['sdk/cosmos/azure-cosmos-spark_3-1_2-12/docs/catalog-api.md', '#3113']
- ['sdk/cosmos/azure-cosmos-spark_3-1_2-12/docs/configuration-reference.md', '#3113']
- ['sdk/cosmos/azure-cosmos-spark_3-1_2-12/docs/local-emulator.md', '#3113']
- ['sdk/cosmos/azure-cosmos-spark_3-1_2-12/docs/quick-start.md', '#3113']
- ['sdk/cosmos/azure-cosmos-spark_3-1_2-12/dev/README.md', '#3113']
- ['sdk/cosmos/README.md', '#3113']
- ['sdk/deviceupdate/azure-iot-deviceupdate/swagger/README.md', '#3113']
- ['sdk/e2e/README.md', '#3113']
Expand Down
6 changes: 6 additions & 0 deletions eng/pipelines/templates/stages/cosmos-emulator-matrix.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@
"PROTOCOLS": "[\"Tcp\"]",
"DESIRED_CONSISTENCIES": "[\"Session\"]",
"AdditionalArgs": "-DargLine=\"-DACCOUNT_HOST=https://localhost:8081/\""
},
"Spark Integration Tests targeting Cosmos Emulator": {
"ProfileFlag": "-PsparkE2E",
"PROTOCOLS": "[\"Tcp\"]",
"DESIRED_CONSISTENCIES": "[\"Session\"]",
"AdditionalArgs": "-DargLine=\"-DACCOUNT_HOST=https://localhost:8081/ -Dhadoop.home.dir=D:/Hadoop\""
}
}
}
Expand Down
21 changes: 21 additions & 0 deletions eng/pipelines/templates/stages/cosmos-sdk-client.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,27 @@ stages:
cp $env:temp\CosmosDbEmulatorCert.cer .
keytool -keystore cacerts -importcert -noprompt -trustcacerts -alias CosmosDbEmulatorCert -file CosmosDbEmulatorCert.cer -storepass changeit
displayName: 'Create Java TrustStore'
- powershell: |
$downloadTarget = Join-Path -Path $env:temp -ChildPath "hadoopsource.zip"
$targetDir = "D:\Hadoop"
Write-Host "Downloading and extracting Hadoop winutils - https://aka.ms/cosmos-hadoop-for-spark-ci-pipeline"
Write-Host "Target: $targetDir"
Invoke-WebRequest "https://aka.ms/cosmos-hadoop-for-spark-ci-pipeline" -OutFile $downloadTarget
if (Test-Path $targetDir) { Remove-Item -Recurse -Force $targetDir }
Expand-Archive -LiteralPath $downloadTarget -DestinationPath $targetDir
Write-Host "Copying binaries to windows sytem32 folder"
Copy-Item -Path "D:\Hadoop\bin\*" -Destination "C:\Windows\System32" -Recurse
Write-Host "Updating environment variables for Hadoop usage"
[System.Environment]::SetEnvironmentVariable("HADOOP_HOME", "D:\Hadoop", [System.EnvironmentVariableTarget]::Machine)
$hadoopHome = [System.Environment]::GetEnvironmentVariable("HADOOP_HOME", [System.EnvironmentVariableTarget]::Machine)
Write-Host "New HADOOP_HOME environment variable: $hadoopHome"
$path = [System.Environment]::GetEnvironmentVariable("Path", [System.EnvironmentVariableTarget]::Machine)
Write-Host "Original Path environment variable: $path"
$newPath = $path + ";D:\Hadoop\bin"
[System.Environment]::SetEnvironmentVariable("Path", $newPath,[System.EnvironmentVariableTarget]::Machine)
$updatedPath = [System.Environment]::GetEnvironmentVariable("Path", [System.EnvironmentVariableTarget]::Machine)
Write-Host "New Path environment variable: $updatedPath"
displayName: 'Download and Extract Hadoop winutils and update env variables for Hadoop usage'
# We `install` separately from running `site:site site:stage` so that the `install` brings in the non-shipping-modules,
# but we don't include them in the Maven site commands (so that we don't generate reports for the non-shipping modules).
Expand Down
21 changes: 20 additions & 1 deletion eng/versioning/external_dependencies.txt
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,6 @@ org.mockito:mockito-core;3.6.28
org.revapi:revapi-java;0.20.0
org.revapi:revapi-maven-plugin;0.11.2


# External Dependency Exceptions
# This section is for external dependencies whose versions were different than
# what was defined in the parent pom.
Expand Down Expand Up @@ -290,6 +289,26 @@ cosmos_org.mockito:mockito-core;1.10.19
cosmos_org.mpierce.metrics.reservoir:hdrhistogram-metrics-reservoir;1.1.0
cosmos_org.hdrhistogram:HdrHistogram;2.1.12

## Cosmos Spark connector under sdk\cosmos\azure-cosmos-spark_3-1_2-12\pom.xml
# Cosmos Spark connector runtime dependencies - provided by Spark runtime/host
cosmos_org.apache.spark:spark-sql_2.12;3.1.1
cosmos_org.apache.spark:spark-hive_2.12;3.1.1
cosmos_org.scala-lang:scala-library;2.12.10
cosmos_org.scala-lang.modules:scala-java8-compat_2.12;0.8.0
cosmos_io.projectreactor:reactor-scala-extensions_2.12;0.8.0
cosmos_commons-io:commons-io;2.4

# Cosmos Spark connector tests only
cosmos_org.scalatest:scalatest_2.12;3.2.2
cosmos_org.scalatest:scalatest-flatspec_2.12;3.2.3
cosmos_org.scalactic:scalactic_2.12;3.2.3
cosmos_org.scalamock:scalamock_2.12;5.0.0

# Maven Tools for Cosmos Spark connector only
cosmos_org.scalatest:scalatest-maven-plugin;2.0.2
cosmos_net.alchim31.maven:scala-maven-plugin;4.4.0
cosmos_org.scalastyle:scalastyle-maven-plugin;1.0.0

# sdk\core\azure-core-serializer-avro-jackson\pom.xml
# This dependency is needed since Jackson Avro uses an older dependency on Apache Avro which is another library.
jacksonavro_org.apache.avro:avro-maven-plugin;1.8.2
Expand Down
1 change: 1 addition & 0 deletions eng/versioning/version_client.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ com.azure:azure-cosmos;4.13.1;4.14.0-beta.1
com.azure:azure-cosmos-benchmark;4.0.1-beta.1;4.0.1-beta.1
com.azure:azure-cosmos-dotnet-benchmark;4.0.1-beta.1;4.0.1-beta.1
com.azure:azure-cosmos-encryption;1.0.0-beta.1;1.0.0-beta.1
com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12;4.0.0-beta.2;4.0.0-beta.2
com.azure:azure-data-appconfiguration;1.1.10;1.2.0-beta.1
com.azure:azure-data-schemaregistry;1.0.0-beta.4;1.0.0-beta.5
com.azure:azure-data-schemaregistry-avro;1.0.0-beta.4;1.0.0-beta.5
Expand Down
4 changes: 4 additions & 0 deletions sdk/cosmos/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.log

metastore_db/*
spark-warehouse/*
4 changes: 4 additions & 0 deletions sdk/cosmos/azure-cosmos-spark_3-1_2-12/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*.log

metastore_db/*
spark-warehouse/*
32 changes: 32 additions & 0 deletions sdk/cosmos/azure-cosmos-spark_3-1_2-12/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
## Release History

## 4.0.0-beta.2 (Unreleased)

## 4.0.0-beta.1 (2021-03-22)
* Cosmos DB Spark 3.1.1 Connector Preview `4.0.0-beta.1` Release.
### Features
* Supports Spark 3.1.1 and Scala 2.12.
* Integrated against Spark3 DataSourceV2 API.
* Developed from the ground up using Cosmos DB Java V4 SDK.
* Added support for Spark Query, Write, and Streaming.
* Added support for Spark3 Catalog metadata APIs.
* Added support for Java V4 Throughput Control.
* Added support for different partitioning strategies.
* Integrated against Cosmos DB TCP protocol.
* Added support for Databricks automated Maven Resolver.
* Added support for broadcasting CosmosClient caches to reduce bootstrapping RU throttling.
* Added support for unified jackson ObjectNode to SparkRow Converter.
* Added support for Raw Json format.
* Added support for Config Validation.
* Added support for Spark application configuration consolidation.
* Integrated against Cosmos DB FeedRange API to support Partition Split Proofing.
* Automated CI testing on Databricks and Cosmos DB live endpoint.
* Automated CI Testing on Cosmos DB Emulator.

### Known limitations
* Spark structured streaming (micro batches) for consuming change feed has been implemented but not tested end-to-end fully so is considered experimental at this point.
* No support for continuous processing (change feed) yet.
* No perf tests / optimizations have been done yet - we will iterate on perf in the next preview releases. So usage should be limited to non-production environments with this preview.

## 4.0.0-alpha.1 (2021-03-17)
* Cosmos DB Spark 3.1.1 Connector Test Release.
84 changes: 84 additions & 0 deletions sdk/cosmos/azure-cosmos-spark_3-1_2-12/CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# Contributing
This document provides guidelines for building the project and contributing code.

## Prerequisites
- JDK 11 and above
- [Maven](https://maven.apache.org/) 3.0 and above

## Build from source
To build the project, run maven commands.

```bash
git clone https://github.com/Azure/azure-sdk-for-java.git
cd azure-sdk-for-java/sdk/cosmos/azure-cosmos-spark_3-1_2-12
mvnw clean install
```

## Test
Integration tests can be run against Azure Cosmos DB or against the
[Azure Cosmos DB Emulator](https://docs.microsoft.com/azure/cosmos-db/local-emulator).
To run them against the emulator, follow the link to set up the emulator
before executing the tests.

- Run unit tests
```bash
mvn clean install -Dgpg.skip
```

- Run integration tests
- on Azure
>**NOTE** Integration tests against Azure require the Azure Cosmos DB Document
>API and will automatically create a Cosmos database in your Azure subscription, so
>**Azure usage fees will apply.**
Integration tests require an Azure subscription. If you don't already have an Azure
subscription, you can activate your
[MSDN subscriber benefits](https://azure.microsoft.com/pricing/member-offers/msdn-benefits-details/)
or sign up for a [free Azure account](https://azure.microsoft.com/free/).

1. Create an Azure Cosmos DB on Azure.
- Go to [Azure portal](https://portal.azure.com/) and click +New.
- Click Databases, and then click Azure Cosmos DB to create your database.
- Navigate to the database you have created, and click Access keys and copy your
URI and access keys for your database.

2. Set the environment variables ACCOUNT_HOST, ACCOUNT_KEY and SECONDARY_ACCOUNT_KEY to
the Cosmos account URI, primary key and secondary key, respectively.

Also set the
second group of environment variables NEW_ACCOUNT_HOST, NEW_ACCOUNT_KEY and
NEW_SECONDARY_ACCOUNT_KEY; the two groups of environment variables may have the same values.
3. Run the Maven command with the `integration-test-azure` profile.

```bash
set ACCOUNT_HOST=your-cosmos-account-uri
set ACCOUNT_KEY=your-cosmos-account-primary-key
set SECONDARY_ACCOUNT_KEY=your-cosmos-account-secondary-key

set NEW_ACCOUNT_HOST=your-cosmos-account-uri
set NEW_ACCOUNT_KEY=your-cosmos-account-primary-key
set NEW_SECONDARY_ACCOUNT_KEY=your-cosmos-account-secondary-key
mvnw -P integration-test-azure clean install
```

- on Emulator

Set up the Azure Cosmos DB Emulator by following
[these instructions](https://docs.microsoft.com/azure/cosmos-db/local-emulator), and set the
associated environment variables. Then run the tests with:
```bash
mvnw -P integration-test-emulator install
```


- Skip tests execution
```bash
mvn clean install -Dgpg.skip -DskipTests
```

## Version management
Development versions are named like `0.1.2-beta.1`. Release versions are named like `0.1.2`.

## Contribute to code
Contributions are welcome. Please follow
[these instructions](https://github.com/Azure/azure-sdk-for-java/blob/master/CONTRIBUTING.md) to contribute code.
89 changes: 89 additions & 0 deletions sdk/cosmos/azure-cosmos-spark_3-1_2-12/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Azure Cosmos DB OLTP Spark 3 connector

**Azure Cosmos DB OLTP Spark connector preview** provides Apache Spark support for Azure Cosmos DB using
the [SQL API][sql_api_query].
[Azure Cosmos DB][cosmos_introduction] is a globally-distributed database service which allows
developers to work with data using a variety of standard APIs, such as SQL, MongoDB, Cassandra, Graph, and Table.

**NOTE: this is a Preview build.
This build has not been load or performance tested yet and, at this point, is not recommended
for use in production scenarios.**

If you have any feedback or ideas on how to improve your experience please let us know here:
https://github.com/Azure/azure-sdk-for-java/issues/new

## Documentation

- [Getting started](https://github.com/Azure/azure-sdk-for-java/blob/feature/cosmos/spark30/sdk/cosmos/azure-cosmos-spark_3-1_2-12/docs/quick-start.md)
- [Catalog API](https://github.com/Azure/azure-sdk-for-java/blob/feature/cosmos/spark30/sdk/cosmos/azure-cosmos-spark_3-1_2-12/docs/catalog-api.md)
- [Configuration Parameter Reference](https://github.com/Azure/azure-sdk-for-java/blob/feature/cosmos/spark30/sdk/cosmos/azure-cosmos-spark_3-1_2-12/docs/configuration-reference.md)

[//]: # (//TODO: moderakh add more sections)
[//]: # (//TODO: moderakh Enable Client Logging)
[//]: # (//TODO: moderakh Examples)
[//]: # (//TODO: moderakh Next steps)
[//]: # (//TODO: moderakh Key concepts)
[//]: # (//TODO: moderakh Azure Cosmos DB Partition)
[//]: # (//TODO: moderakh Troubleshooting)

## Version Compatibility

| Connector | Spark | Minimum Java Version | Supported Scala Versions |
| ------------- | ------------- | -------------------- | ----------------------- |
| 4.0.0-beta.1 | 3.1.1 | 8 | 2.12 |

## Download

You can use the maven coordinate of the jar to auto install the Spark Connector to your Databricks Runtime 8 from Maven:
`com.azure.cosmos.spark:azure-cosmos-spark_3-1_2-12:4.0.0-beta.1`

You can also integrate against Cosmos DB Spark Connector in your SBT project:
```scala
libraryDependencies += "com.azure.cosmos.spark" % "azure-cosmos-spark_3-1_2-12" % "4.0.0-beta.1"
```

Cosmos DB Spark Connector is available on [Maven Central Repo](https://search.maven.org/artifact/com.azure.cosmos.spark/azure-cosmos-spark_3-1_2-12/4.0.0-beta.1/jar).

### General

If you encounter any bug, please file an issue [here](https://github.com/Azure/azure-sdk-for-java/issues/new).

To suggest a new feature or changes that could be made, file an issue the same way you would for a bug.

## License
This project is under the MIT license and uses and repackages other third-party libraries as an uber jar.
See [NOTICE.txt](https://github.com/Azure/azure-sdk-for-java/blob/feature/cosmos/spark30/NOTICE.txt).

## Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
[Contributor License Agreement (CLA)][cla] declaring that you have the right to, and actually do, grant us the rights
to use your contribution.

When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate
the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to
do this once across all repos using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct][coc]. For more information see the [Code of Conduct FAQ][coc_faq]
or contact [opencode@microsoft.com][coc_contact] with any additional questions or comments.

<!-- LINKS -->
[source_code]: src
[cosmos_introduction]: https://docs.microsoft.com/azure/cosmos-db/
[cosmos_docs]: https://docs.microsoft.com/azure/cosmos-db/introduction
[jdk]: https://docs.microsoft.com/java/azure/jdk/?view=azure-java-stable
[maven]: https://maven.apache.org/
[cla]: https://cla.microsoft.com
[coc]: https://opensource.microsoft.com/codeofconduct/
[coc_faq]: https://opensource.microsoft.com/codeofconduct/faq/
[coc_contact]: mailto:opencode@microsoft.com
[azure_subscription]: https://azure.microsoft.com/free/
[samples]: https://github.com/Azure/azure-sdk-for-java/tree/master/sdk/cosmos/azure-spring-data-cosmos/src/samples/java/com/azure/spring/data/cosmos
[sql_api_query]: https://docs.microsoft.com/azure/cosmos-db/sql-api-sql-query
[local_emulator]: https://docs.microsoft.com/azure/cosmos-db/local-emulator
[local_emulator_export_ssl_certificates]: https://docs.microsoft.com/azure/cosmos-db/local-emulator-export-ssl-certificates
[azure_cosmos_db_partition]: https://docs.microsoft.com/azure/cosmos-db/partition-data
[sql_queries_in_cosmos]: https://docs.microsoft.com/azure/cosmos-db/tutorial-query-sql-api
[sql_queries_getting_started]: https://docs.microsoft.com/azure/cosmos-db/sql-query-getting-started


Loading

0 comments on commit 852122a

Please sign in to comment.