fix(libraries-release-data): add service name lookup from artifact id…

… for gapic libraries, add apiary libraries and spring cloud gcp to release data table. (#6437) Changes in this PR: - Add script to infer the artifact id to service name match from StubSettings (closing #6390 in favor of this PR). - Add script to parse artifact id and service name for apiary libraries from discovery docs. - Manually add spring-cloud-gcp to the tracked library list. Note that the service name column is borrowed here for the tool name (this is to match tool release dates). - Revamp `fetch-library-data.sh` to require a less strict structure in the returned maven page, so it can be reused for apiary lists. - Other minor cleanups. Match sample analysis: for services sampled from 2023-01-01 to now, before this change 145/235 did not have a match; after this change 53/235 are missing a match. Future enhancements: infer gapic libraries' artifact id and service name closer to the source (from googleapis yaml/protos).
googleapis · Feb 16, 2024 · 9491edf · 9491edf
1 parent ac9893c
commit 9491edf
Show file tree

Hide file tree

Showing 4 changed files with 178 additions and 28 deletions.
diff --git a/.kokoro/nightly/create-versions-csv.sh b/.kokoro/nightly/create-versions-csv.sh
@@ -3,7 +3,7 @@
 # Output:
 # The script generates cloud_java_client_library_release_dates.csv that holds the data defined below.
 # It has artifact_id,service_name,version, and release_date columns.
-# this csv file will be uploaded to (project) cloud-java-metrics.(dataset) client_library_versions. (table) cloud_java_client_library_release_dates
+# this csv file is uploaded to (project) cloud-java-metrics.(dataset) client_library_versions. (table) cloud_java_client_library_release_dates
 # using bq load command
 
 # Fail on any error.
@@ -13,6 +13,10 @@ set -x
 
 cd github/java-cloud-bom
 
+# prepare the lists that match artifact ids to service names
+.kokoro/nightly/get-service-names.sh
+.kokoro/nightly/get-apiary-service-names.sh
+
 mvn -B clean install
 
 cd libraries-release-data
@@ -30,32 +34,62 @@ sed -i '/libraries-release-data/d' unfiltered-libraries.txt
 sort unfiltered-libraries.txt | uniq > libraries.txt
 rm -f unfiltered-libraries.txt
 
+service_file="artifacts_to_services.txt"
 
 cat libraries.txt | while read line; do
 
   group_id=${line%:*}
   artifact_id=${line#*:}
   new_group_id="${group_id//.//}"
-  service_name=${artifact_id#*-cloud-}
-
-  if [[ "${artifact_id}" == google-cloud-storage ]]; then
-    service_name=bigstore
+  # Check if artifactId contains "emulator"
+  if [[ $artifact_id =~ .*emulator.* ]]; then
+      echo "artifactId contains 'emulator': $artifactId"
+      continue
   fi
-  if [[ "${artifact_id}" == google-cloud-storage-transfer ]]; then
-    service_name=storagetransfer
+  service_name=$(grep "^${artifact_id}," "$service_file" | cut -d ',' -f 2)
+  if [[ -n $service_name ]]; then
+      echo "Service Name found: $service_name"
+  else
+      echo "No matching service name found for artifactId: $artifact_id"
   fi
 
   URL=https://repo1.maven.org/maven2/$new_group_id/$artifact_id
 
-  ../.kokoro/nightly/fetch-library-data.sh $URL $artifact_id $service_name
+  ../.kokoro/nightly/fetch-library-data.sh $URL $artifact_id $service_name >> cloud_java_client_library_release_dates.csv
 
 done
 
+# apiary list
+
+sort artifacts_to_services_apiary.txt | uniq > artifacts_to_services_apiary_uniq.txt
+
+apiary_list="artifacts_to_services_apiary_uniq.txt"
+
+# Read the input file line by line
+while IFS= read -r line; do
+    # Split line into values using comma as delimiter
+    IFS=',' read -r -a values <<< "$line"
+    group_id=${values[0]}
+    artifact_id=${values[1]}
+    service_name=${values[2]}
+    new_group_id="${group_id//./\/}"
+    URL=https://repo1.maven.org/maven2/$new_group_id/$artifact_id
+    ../.kokoro/nightly/fetch-library-data.sh $URL $artifact_id $service_name >> cloud_java_client_library_release_dates.csv
+done < "$apiary_list"
+
+# add spring cloud gcp, "service_name" is tool_name
+../.kokoro/nightly/fetch-library-data.sh https://repo1.maven.org/maven2/com/google/cloud/spring-cloud-gcp-dependencies/ spring-cloud-gcp-dependencies spring-cloud-gcp >> cloud_java_client_library_release_dates.csv
+../.kokoro/nightly/fetch-library-data.sh https://repo1.maven.org/maven2/org/springframework/cloud/spring-cloud-gcp-dependencies/ spring-cloud-gcp-dependencies spring-cloud-gcp >> cloud_java_client_library_release_dates.csv
+
 rm -f libraries.txt
+rm -f artifacts_to_services_apiary.txt
+rm -f "$apiary_list"
 
-sed 's/ \+/,/g' cloud_java_client_library_release_dates_tsv.txt > cloud_java_client_library_release_dates.csv
 sed -i '1s/^/version,release_date,artifact_id,service_name\n/' cloud_java_client_library_release_dates.csv
 
+# remove rows where no service name match was found (lines ending with a comma)
+sed -i '/,$/d' cloud_java_client_library_release_dates.csv
+
 echo "Inserting client_library_versions.cloud_java_client_library_release_dates. First 10 lines:"
 head  cloud_java_client_library_release_dates.csv
 echo "===================="
@@ -64,6 +98,5 @@ bq load --skip_leading_rows=1 --project_id=cloud-java-metrics --source_format=CS
 client_library_versions.cloud_java_client_library_release_dates \
 cloud_java_client_library_release_dates.csv
 
-
-rm -f cloud_java_client_library_release_dates_tsv.txt
 rm -f cloud_java_client_library_release_dates.csv
+rm -f artifacts_to_services.txt
diff --git a/.kokoro/nightly/fetch-library-data.sh b/.kokoro/nightly/fetch-library-data.sh
@@ -3,8 +3,8 @@
 # input for this script will be URL, artifact_id and service_name
 # example: https://repo1.maven.org/maven2/com/google/cloud/google-cloud-vision google-cloud-vision vision
 
-# output: cloud_java_client_library_release_dates_tsv.txt which contains
-# artifact_id,service_name,version, and release_date for the artifacts (without the column headers)
+# output: one line per version, written to stdout, in the format of
+# version,release_date,artifact_id,service_name for the artifacts
 
 mavenCentralURL=$1
 artifact_id=$2
@@ -15,20 +15,22 @@ wget -O mavenFile --referer --recursive -nd --no-parent \
   --header="User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" \
   ${1}
 
-grep -E '<a href=".*">' mavenFile > mavenContents.txt
+outputFile="maven_versions_and_dates.txt"
+# assume semantic versions, starting with a number. Get the lines from the file that look like
+# '<a href="0.10.0-beta/" title="0.10.0-beta/">0.10.0-beta/</a>                                      2017-03-17 00:01         -     '
+grep -E '<a href=\"[0-9]|[a-z].*\"\s' mavenFile | \
+grep -v -E '(metadata)|(meta name)' | \
+# remove  content between '/" title=' and '</a>'
+sed -e 's/\/"\stitle=.*<\/a>//' | \
+# remove content before version
+sed -e 's/<a href=\"//' | \
+# replace runs of three or more whitespace characters with ';'
+sed -E 's/[[:space:]]{3,}/;/g' | \
+# get version and date only
+awk -F'[ ;]' '{print $1, $2}' | \
+# insert artifact_id and service_name
+awk '{$3=a}1' a="${artifact_id}" | \
+awk '{$4=b}1' b="${service_name}" | \
+sed 's/ \+/,/g'
 
-awk  '/a/ {print  $2 "\t" $4}'  mavenContents.txt > finalContents.txt
-sed -i 1d  finalContents.txt
-sed -i '/maven-metadata/d' finalContents.txt
-sed -i 's/href="//g' finalContents.txt
-sed -i 's/"//g' finalContents.txt
-sed -i 's|/||g' finalContents.txt
-awk '{$3=a}1' a="${2}" finalContents.txt > newfile.txt
-awk '{$4=b}1' b="${3}" newfile.txt > final.txt
-cat final.txt >> cloud_java_client_library_release_dates_tsv.txt
-
-rm -f final.txt
-rm -f newfile.txt
 rm -f mavenFile
-rm -f mavenContents.txt
-rm -f finalContents.txt
diff --git a/.kokoro/nightly/get-apiary-service-names.sh b/.kokoro/nightly/get-apiary-service-names.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# This script should download the
+# discovery docs: git@github.com:googleapis/discovery-artifact-manager.git
+# apiary repo: git@github.com:googleapis/google-api-java-client-services.git
+# and then, from the discovery doc of each service,
+# parse the artifact id ("name") and the service name ("rootUrl").
+
+# Run this script from repo root dir
+# input: N/A
+# output: txt file with comma separated group_id, artifact_id, service_name.
+
+git clone https://github.com/googleapis/discovery-artifact-manager.git
+
+cd ./discovery-artifact-manager/discoveries || exit
+output_filename="../../libraries-release-data/artifacts_to_services_apiary.txt"
+
+# install jq to extract info from JSON data
+sudo apt-get update
+sudo apt-get install -q -y jq
+
+# loop through discovery json files
+for file in *.json; do
+    # Use jq to extract the "name", "rootUrl" and "ownerDomain" fields
+
+    # group_id logic: https://github.com/googleapis/google-api-java-client-services/blob/421c5d6ed56d5eb1257d3fc057d7d6b4fd2f9bb7/generator/src/googleapis/codegen/utilities/maven_utils.py#L50
+    # artifact_id logic: https://github.com/googleapis/google-api-java-client-services/blob/421c5d6ed56d5eb1257d3fc057d7d6b4fd2f9bb7/generator/src/googleapis/codegen/utilities/maven_utils.py#L42-L47
+    # default_host https://github.com/googleapis/discovery-artifact-manager/blob/9f6638a9950991d4fe67d75bdb539e6d2be20541/google-api-client-generator/src/googleapis/codegen/languages/java/default/templates/___package___/___api_className___.java.tmpl#L44
+    artifact_id_suffix=$(jq -r '.name' "$file")
+    default_host=$(jq -r '.rootUrl' "$file")
+    owner_domain=$(jq -r '.ownerDomain' "$file")
+
+    if [[ "$default_host" =~ ^https:// ]] && [ -n "$artifact_id_suffix" ] && [ -n "$owner_domain" ]; then
+      if [[ "$owner_domain" != 'google.com' ]]; then
+        echo "$owner_domain =============="
+        continue
+      fi
+      group_id="com.google.apis"
+      service_name=$(echo "$default_host" | cut -d'/' -f3 | cut -d'.' -f1)
+      artifact_id="google-api-services-${artifact_id_suffix}"
+      echo "${group_id},${artifact_id},${service_name}" >> "$output_filename"
+    else
+        echo "$default_host: Not a valid URL or No 'name' field found in $file"
+    fi
+
+done
+
+cd ../..
+
+rm -rf discovery-artifact-manager/
diff --git a/.kokoro/nightly/get-service-names.sh b/.kokoro/nightly/get-service-names.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# This script downloads the google-cloud-java repo, loops through the modules whose names start with "java-",
+# grabs the artifactId from the pom.xml file within the submodule whose name starts with "google-", and
+# the service name from a *StubSettings.java file.
+
+# Run this script from repo root dir
+# input: N/A
+# output: txt file with comma separated artifact_id, service_name.
+
+git clone https://github.com/googleapis/google-cloud-java.git
+
+cd ./google-cloud-java || exit
+filename="../artifacts_to_services.txt"
+
+for module in $(find . -mindepth 2 -maxdepth 2 -name pom.xml | sort | xargs dirname); do
+    echo "module: ${module}"
+    #  Only modules starting with java- contain client library artifacts.
+    if [[ ${module} != ./java-* ]]; then
+      echo "not a client library, continue..."
+      continue
+    fi
+    # special cases, add manually later.
+    if [[ ${module} == ./java-dns ]] || [[ ${module} == ./java-grafeas ]] || [[ ${module} == ./java-notification ]] || [[ ${module} == ./java-alloydb-connectors ]]; then
+      continue
+    fi
+    cd "${module}" || exit
+    #  Find submodule with name starting with "google-", this is to exclude proto, grpc and bom folders,
+    #  and locate artifact id of client library
+    folder=$(find . -mindepth 1 -maxdepth 1 -type d -name "google-*" ! -name "*-bom" )
+    echo "folder: ${folder}"
+    cd "${folder}" || continue
+    artifact_id_string=$(find . -name 'pom.xml' -print -quit | xargs grep -m 1 '<artifactId>' | cut -d '>' -f 2 | cut -d '<' -f 1)
+    echo "artifact_id_string: ${artifact_id_string}"
+    cd .. # exit from folder ${folder}
+
+    # Find *StubSettings file, get the first line containing '.googleapis.com:443'
+    # Extract service name from it
+    string=$(find . -name '*StubSettings.java' -print -quit | xargs grep -m 1 '.googleapis.com:443')
+    service_name=$(echo "${string}" | grep -o '".*"' | tr -d '"' | cut -d "." -f 1 | cut -d "-" -f 1)
+    echo "service name: ${service_name}"
+    echo "${artifact_id_string}, ${service_name}" >> "$filename"
+    cd .. # exit from ${module}
+done
+
+# add handwritten libraries manually.
+{
+  echo "google-cloud-bigquery, bigquery"
+  echo "google-cloud-bigtable, bigtable"
+  echo "google-cloud-bigquerystorage, bigquerystorage"
+  echo "google-cloud-datastore, datastore"
+  echo "google-cloud-firestore, firestore"
+  echo "google-cloud-logging, logging"
+  echo "google-cloud-pubsub, pubsub"
+  echo "google-cloud-pubsublite, pubsublite"
+  echo "google-cloud-storage, bigstore"
+  echo "google-cloud-storage-control, storage"
+  echo "google-cloud-spanner, spanner"
+  echo "google-cloud-dns, dns"
+} >> "./artifacts_to_services.txt"
+
+cd ..
+mv ./google-cloud-java/artifacts_to_services.txt ./libraries-release-data/artifacts_to_services.txt
+# clean up
+rm -rf google-cloud-java/