Skip to content

Commit

Permalink
fix(libraries-release-data): add service name lookup from artifact id…
Browse files Browse the repository at this point in the history
… for gapic libraries, add apiary libraries and spring cloud gcp to release data table. (#6437)

Changes in this pr:
- Add script to infer artifact id to service name match from StubSettings. (closing #6390 in favor of this pr) 
- Add script to parse artifact id and service name for apiary libraries from discovery docs.
- Manually add spring-cloud-gcp to tracked library list. Note that service name column is borrowed here for tool name. (this is to match tool release dates)
- Revamped `fetch-library-data.sh` to depend less strictly on the structure of the returned Maven page, so it can be reused for the apiary lists.
- Other minor cleanups.

Match sample analysis:
For services sampled from 2023-01-01 to now,
145/235 had no match before this change; 53/235 are still missing a match after this change.

future enhancements:
To infer gapic libraries artifact id and service name closer to source (from googleapis yaml/protos).
  • Loading branch information
zhumin8 authored Feb 16, 2024
1 parent ac9893c commit 9491edf
Show file tree
Hide file tree
Showing 4 changed files with 178 additions and 28 deletions.
55 changes: 44 additions & 11 deletions .kokoro/nightly/create-versions-csv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Output:
# The script generates cloud_java_client_library_release_dates.csv that holds the data defined below.
# It has version, release_date, artifact_id, and service_name columns (in that order).
# this csv file will be uploaded to (project) cloud-java-metrics.(dataset) client_library_versions. (table) cloud_java_client_library_release_dates
# this csv file is uploaded to (project) cloud-java-metrics.(dataset) client_library_versions. (table) cloud_java_client_library_release_dates
# using bq load command

# Fail on any error.
Expand All @@ -13,6 +13,10 @@ set -x

cd github/java-cloud-bom

# prepare list of artifact id and service name match
.kokoro/nightly/get-service-names.sh
.kokoro/nightly/get-apiary-service-names.sh

mvn -B clean install

cd libraries-release-data
Expand All @@ -30,32 +34,62 @@ sed -i '/libraries-release-data/d' unfiltered-libraries.txt
sort unfiltered-libraries.txt | uniq > libraries.txt
rm -f unfiltered-libraries.txt

service_file="artifacts_to_services.txt"

cat libraries.txt | while read line; do

group_id=${line%:*}
artifact_id=${line#*:}
new_group_id="${group_id//.//}"
service_name=${artifact_id#*-cloud-}

if [[ "${artifact_id}" == google-cloud-storage ]]; then
service_name=bigstore
# Skip any artifact whose id contains "emulator"; no release data is fetched for it.
if [[ "${artifact_id}" == *emulator* ]]; then
  # Log the id that is being skipped. This previously expanded $artifactId, which is
  # never assigned in this script, so the message always ended with an empty id.
  echo "artifactId contains 'emulator': ${artifact_id}"
  continue
fi
if [[ "${artifact_id}" == google-cloud-storage-transfer ]]; then
service_name=storagetransfer
service_name=$(grep "^${artifact_id}," "$service_file" | cut -d ',' -f 2)
if [[ -n $service_name ]]; then
echo "Service Name found: $service_name"
else
echo "No matching service name found for artifactId: $artifact_id"
fi

URL=https://repo1.maven.org/maven2/$new_group_id/$artifact_id

../.kokoro/nightly/fetch-library-data.sh $URL $artifact_id $service_name
../.kokoro/nightly/fetch-library-data.sh $URL $artifact_id $service_name >> cloud_java_client_library_release_dates.csv

done

# Apiary libraries: append release data for every row of the list written by
# get-apiary-service-names.sh (one "group_id,artifact_id,service_name" per line).

# De-duplicate the generated list before iterating over it.
apiary_list="artifacts_to_services_apiary_uniq.txt"
sort -u artifacts_to_services_apiary.txt > "$apiary_list"

# Split each row on commas straight into the three fields; anything after a
# third comma lands in the throwaway variable and is ignored.
while IFS=',' read -r group_id artifact_id service_name _; do
  # Maven Central lays a group id out as a directory path: a.b.c -> a/b/c
  new_group_id="${group_id//./\/}"
  URL="https://repo1.maven.org/maven2/${new_group_id}/${artifact_id}"
  # Quote the arguments so a value can never be word-split or glob-expanded.
  ../.kokoro/nightly/fetch-library-data.sh "$URL" "$artifact_id" "$service_name" \
    >> cloud_java_client_library_release_dates.csv
done < "$apiary_list"

# add spring cloud gcp, "service_name" is tool_name
../.kokoro/nightly/fetch-library-data.sh https://repo1.maven.org/maven2/com/google/cloud/spring-cloud-gcp-dependencies/ spring-cloud-gcp-dependencies spring-cloud-gcp >> cloud_java_client_library_release_dates.csv
../.kokoro/nightly/fetch-library-data.sh https://repo1.maven.org/maven2/org/springframework/cloud/spring-cloud-gcp-dependencies/ spring-cloud-gcp-dependencies spring-cloud-gcp >> cloud_java_client_library_release_dates.csv

rm -f libraries.txt
rm -f artifacts_to_services_apiary.txt
rm -f "$apiary_list"

sed 's/ \+/,/g' cloud_java_client_library_release_dates_tsv.txt > cloud_java_client_library_release_dates.csv
sed -i '1s/^/version,release_date,artifact_id,service_name\n/' cloud_java_client_library_release_dates.csv

# remove where service match not found
sed -i '/,$/d' cloud_java_client_library_release_dates.csv

echo "Inserting client_library_versions.cloud_java_client_library_release_dates. First 10 lines:"
head cloud_java_client_library_release_dates.csv
echo "===================="
Expand All @@ -64,6 +98,5 @@ bq load --skip_leading_rows=1 --project_id=cloud-java-metrics --source_format=CS
client_library_versions.cloud_java_client_library_release_dates \
cloud_java_client_library_release_dates.csv


rm -f cloud_java_client_library_release_dates_tsv.txt
rm -f cloud_java_client_library_release_dates.csv
rm -f artifacts_to_services.txt
36 changes: 19 additions & 17 deletions .kokoro/nightly/fetch-library-data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
# input for this script will be URL, artifact_id and service_name
# example: https://repo1.maven.org/maven2/com/google/cloud/google-cloud-vision google-cloud-vision vision

# output: cloud_java_client_library_release_dates_tsv.txt which contains
# artifact_id,service_name,version, and release_date for the artifacts (without the column headers)
# output: one line per released version, written to stdout, in the format
# version,release_date,artifact_id,service_name (without the column headers)

mavenCentralURL=$1
artifact_id=$2
Expand All @@ -15,20 +15,22 @@ wget -O mavenFile --referer --recursive -nd --no-parent \
--header="User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" \
${1}

grep -E '<a href=".*">' mavenFile > mavenContents.txt
# NOTE(review): outputFile is not referenced anywhere in the visible part of this script — confirm it can be removed.
outputFile="maven_versions_and_dates.txt"
# Turn the directory listing saved in mavenFile into comma separated rows on stdout:
#   version,release_date,artifact_id,service_name
# Assume semantic versions, starting with a number. Get lines from the file that look like
# '<a href="0.10.0-beta/" title="0.10.0-beta/">0.10.0-beta/</a> 2017-03-17 00:01 - '
# NOTE(review): '|' has the lowest precedence in the pattern below, so it matches either
# '<a href="' followed by a digit, or any lowercase letter that is later followed by '"' and
# a whitespace character — confirm this breadth is intended (the next grep filters the extras).
grep -E '<a href=\"[0-9]|[a-z].*\"\s' mavenFile | \
# drop lines that mention 'metadata' or 'meta name'
grep -v -E '(metadata)|(meta name)' | \
# remove content between '/" title=' and '</a>'
sed -e 's/\/"\stitle=.*<\/a>//' | \
# remove content before version
sed -e 's/<a href=\"//' | \
# replace every run of three or more whitespace characters with ';'
# (the single space inside 'date time' is left alone)
sed -E 's/[[:space:]]{3,}/;/g' | \
# get version and date only: split on ' ' or ';' and keep the first two fields
awk -F'[ ;]' '{print $1, $2}' | \
# insert artifact_id and service_name as the third and fourth fields
awk '{$3=a}1' a="${artifact_id}" | \
awk '{$4=b}1' b="${service_name}" | \
# join the space separated fields with commas
sed 's/ \+/,/g'

awk '/a/ {print $2 "\t" $4}' mavenContents.txt > finalContents.txt
sed -i 1d finalContents.txt
sed -i '/maven-metadata/d' finalContents.txt
sed -i 's/href="//g' finalContents.txt
sed -i 's/"//g' finalContents.txt
sed -i 's|/||g' finalContents.txt
awk '{$3=a}1' a="${2}" finalContents.txt > newfile.txt
awk '{$4=b}1' b="${3}" newfile.txt > final.txt
cat final.txt >> cloud_java_client_library_release_dates_tsv.txt

rm -f final.txt
rm -f newfile.txt
rm -f mavenFile
rm -f mavenContents.txt
rm -f finalContents.txt
50 changes: 50 additions & 0 deletions .kokoro/nightly/get-apiary-service-names.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash

# Builds the artifact id -> service name list for the apiary libraries.
#
# Clones the discovery docs repo
#   https://github.com/googleapis/discovery-artifact-manager
# and, for each discovery JSON file, derives
#   - the artifact id from the "name" field, and
#   - the service name from the first host label of the "rootUrl" field.
# (apiary repo, for reference: https://github.com/googleapis/google-api-java-client-services)

# Run this script from repo root dir
# input: N/A
# output: txt file with comma separated group_id, artifact_id, service_name
#         (rows are appended to libraries-release-data/artifacts_to_services_apiary.txt).

git clone https://github.com/googleapis/discovery-artifact-manager.git

cd ./discovery-artifact-manager/discoveries || exit
output_filename="../../libraries-release-data/artifacts_to_services_apiary.txt"

# install jq to extract info from JSON data
sudo apt-get update
sudo apt-get install -q -y jq

# loop through discovery json files
for file in *.json; do
  # An unmatched glob is left as the literal "*.json"; do not hand that to jq.
  [[ -e "$file" ]] || continue

  # group_id logic: https://github.com/googleapis/google-api-java-client-services/blob/421c5d6ed56d5eb1257d3fc057d7d6b4fd2f9bb7/generator/src/googleapis/codegen/utilities/maven_utils.py#L50
  # artifact_id logic: https://github.com/googleapis/google-api-java-client-services/blob/421c5d6ed56d5eb1257d3fc057d7d6b4fd2f9bb7/generator/src/googleapis/codegen/utilities/maven_utils.py#L42-L47
  # default_host https://github.com/googleapis/discovery-artifact-manager/blob/9f6638a9950991d4fe67d75bdb539e6d2be20541/google-api-client-generator/src/googleapis/codegen/languages/java/default/templates/___package___/___api_className___.java.tmpl#L44
  #
  # "// empty" makes jq print nothing for a missing or null field. A plain
  # `jq -r '.name'` prints the string "null" in that case, which the -n checks
  # below would accept and turn into a bogus "google-api-services-null" row.
  artifact_id_suffix=$(jq -r '.name // empty' "$file")
  default_host=$(jq -r '.rootUrl // empty' "$file")
  owner_domain=$(jq -r '.ownerDomain // empty' "$file")

  if [[ "$default_host" =~ ^https:// ]] && [[ -n "$artifact_id_suffix" ]] && [[ -n "$owner_domain" ]]; then
    # Only docs owned by google.com are listed; log and skip the others.
    if [[ "$owner_domain" != 'google.com' ]]; then
      echo "$owner_domain =============="
      continue
    fi
    group_id="com.google.apis"
    # https://<service>.<rest of host>/... -> <service>
    service_name=$(echo "$default_host" | cut -d'/' -f3 | cut -d'.' -f1)
    artifact_id="google-api-services-${artifact_id_suffix}"
    echo "${group_id},${artifact_id},${service_name}" >> "$output_filename"
  else
    echo "$default_host: Not a valid URL or No 'name' field found in $file"
  fi

done

cd ../..

rm -rf discovery-artifact-manager/
65 changes: 65 additions & 0 deletions .kokoro/nightly/get-service-names.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash

# This script downloads the google-cloud-java repo, loops through modules with names starting
# with "java-", grabs the artifactId from the pom.xml file within the submodule whose name starts
# with "google-", and the service name from a *StubSettings.java file.

# Run this script from repo root dir
# input: N/A
# output: txt file with comma separated artifact_id, service_name
#         (moved to libraries-release-data/artifacts_to_services.txt at the end).

git clone https://github.com/googleapis/google-cloud-java.git

cd ./google-cloud-java || exit

# Absolute paths, so nothing below depends on how many "cd .." calls have run.
# Previously a module with zero or several google-* folders left the working
# directory out of sync, and the next "cd <module> || exit" aborted the script.
clone_dir=${PWD}
filename="${clone_dir}/artifacts_to_services.txt"

# Directories that sit exactly one level below the clone and contain a pom.xml.
mapfile -t modules < <(find . -mindepth 2 -maxdepth 2 -name pom.xml | sort | xargs dirname)

for module in "${modules[@]}"; do
  echo "module: ${module}"
  # Only modules starting with java- contain client library artifacts.
  if [[ ${module} != ./java-* ]]; then
    echo "not a client library, continue..."
    continue
  fi
  # special cases, add manually later.
  case "${module}" in
    ./java-dns | ./java-grafeas | ./java-notification | ./java-alloydb-connectors)
      continue
      ;;
  esac

  # Always enter the module from the clone root; no need to climb back out afterwards.
  cd "${clone_dir}/${module}" || exit

  # Find submodule with name starting with "google-", this is to exclude proto, grpc and bom folders,
  # and locate artifact id of client library
  folder=$(find . -mindepth 1 -maxdepth 1 -type d -name "google-*" ! -name "*-bom")
  echo "folder: ${folder}"
  # Zero matches give an empty string, several matches give a multi-line string;
  # neither names a directory, so the module is skipped.
  if [[ -z "${folder}" || ! -d "${folder}" ]]; then
    echo "no single google-* folder in ${module}, continue..."
    continue
  fi

  # First <artifactId> element of the first pom.xml found under that folder.
  artifact_id_string=$(find "${folder}" -name 'pom.xml' -print -quit \
    | xargs grep -m 1 '<artifactId>' | cut -d '>' -f 2 | cut -d '<' -f 1)
  echo "artifact_id_string: ${artifact_id_string}"

  # Find *StubSettings file, get the first line containing '.googleapis.com:443'
  # Extract service name from it
  string=$(find . -name '*StubSettings.java' -print -quit | xargs grep -m 1 -F '.googleapis.com:443')
  # Take the quoted host, keep the part before the first '.', then before the first '-'.
  service_name=$(echo "${string}" | grep -o '".*"' | tr -d '"' | cut -d "." -f 1 | cut -d "-" -f 1)
  echo "service name: ${service_name}"
  echo "${artifact_id_string}, ${service_name}" >> "$filename"
done

cd "${clone_dir}" || exit

# add handwritten libraries manually.
{
  echo "google-cloud-bigquery, bigquery"
  echo "google-cloud-bigtable, bigtable"
  echo "google-cloud-bigquerystorage, bigquerystorage"
  echo "google-cloud-datastore, datastore"
  echo "google-cloud-firestore, firestore"
  echo "google-cloud-logging, logging"
  echo "google-cloud-pubsub, pubsub"
  echo "google-cloud-pubsublite, pubsublite"
  echo "google-cloud-storage, bigstore"
  echo "google-cloud-storage-control, storage"
  echo "google-cloud-spanner, spanner"
  echo "google-cloud-dns, dns"
} >> "${filename}"

cd ..
mv ./google-cloud-java/artifacts_to_services.txt ./libraries-release-data/artifacts_to_services.txt
# clean up
rm -rf google-cloud-java/

0 comments on commit 9491edf

Please sign in to comment.