From ce913efea3d3a77bdebfc3154ed132b0560d5c11 Mon Sep 17 00:00:00 2001 From: Evis Drenova <80707987+evisdrenova@users.noreply.github.com> Date: Thu, 23 May 2024 17:04:30 -0700 Subject: [PATCH] NEOS-1107: seo updates (#2018) --- frontend/apps/web/app/(mgmt)/layout.tsx | 2 +- frontend/apps/web/app/invite/layout.tsx | 2 +- marketing/app/about/Hero.tsx | 4 +- marketing/app/about/page.tsx | 7 +- marketing/app/blog/page.tsx | 16 +++-- marketing/app/layout.tsx | 11 +++- marketing/app/msa/page.tsx | 27 +++++++- marketing/app/page.tsx | 9 ++- marketing/app/pricing/page.tsx | 65 ++++++++++++++----- marketing/app/privacy-policy/page.tsx | 27 +++++++- marketing/app/sitemap.ts | 1 - .../fix-staging-environments/hero.tsx | 4 +- .../fix-staging-environments/page.tsx | 2 + .../reproduce-prod-bugs-locally/hero.tsx | 4 +- .../reproduce-prod-bugs-locally/page.tsx | 2 + .../app/solutions/security-privacy/hero.tsx | 4 +- .../app/solutions/security-privacy/page.tsx | 2 + .../unblock-local-development/hero.tsx | 4 +- .../unblock-local-development/page.tsx | 2 + marketing/app/terms-of-service/page.tsx | 29 ++++++++- marketing/components/landing-page/Hero.tsx | 7 +- .../content/blog/3-types-of-deployments.mdx | 14 ++-- .../ai-driven-synthetic-data-generation.mdx | 18 ++--- marketing/content/blog/github-oss-metrics.mdx | 34 +++++----- .../content/blog/how-does-pg-dump-work.mdx | 16 ++--- .../blog/introducing-custom-transformers.mdx | 12 ++-- .../content/blog/introducing-neosync.mdx | 14 ++-- marketing/content/blog/log-me-out.mdx | 2 +- marketing/content/blog/neosync-benchmarks.mdx | 18 ++--- .../content/blog/neosync-llm-integration.mdx | 20 +++--- marketing/content/blog/neosync-migration.mdx | 24 +++---- .../blog/neosync-neon-data-gen-job.mdx | 16 ++--- .../content/blog/neosync-neon-sync-job.mdx | 16 ++--- .../content/blog/neosync-rds-data-gen-job.mdx | 16 ++--- .../content/blog/neosync-rds-sync-job.mdx | 16 ++--- .../blog/neosync-supabase-data-gen-job.mdx | 16 ++--- .../blog/neosync-supabase-sync-job.mdx | 16 ++--- .../content/blog/pganonmizer-alternatives.mdx | 16 ++--- .../content/blog/referential-integrity.mdx | 14 ++-- .../blog/subset-referential-integrity.mdx | 18 ++--- ...synthetic-data-encryption-tokenization.mdx | 16 ++--- .../blog/synthetic-data-engineering.mdx | 16 ++--- .../blog/synthetic-data-tokenization.mdx | 22 +++---- marketing/content/blog/terraform-support.mdx | 10 +-- .../blog/top-4-usecases-synthetic-data.mdx | 20 +++--- .../content/blog/what-is-pg-anonymizer.mdx | 22 +++---- .../blog/what-is-platform-engineering.mdx | 20 +++--- marketing/content/blog/what-is-test-data.mdx | 16 ++--- 48 files changed, 410 insertions(+), 279 deletions(-) diff --git a/frontend/apps/web/app/(mgmt)/layout.tsx b/frontend/apps/web/app/(mgmt)/layout.tsx index ab2dd60ea1..a17f113226 100644 --- a/frontend/apps/web/app/(mgmt)/layout.tsx +++ b/frontend/apps/web/app/(mgmt)/layout.tsx @@ -10,7 +10,7 @@ import BaseLayout from '../BaseLayout'; export const metadata: Metadata = { title: 'Neosync', - description: 'Open Source Test Data Management', + description: 'Open Source Data Anonymization and Synthetic Data', icons: [{ rel: 'icon', url: '/favicon.ico' }], }; diff --git a/frontend/apps/web/app/invite/layout.tsx b/frontend/apps/web/app/invite/layout.tsx index f4d5436883..c8bcc13547 100644 --- a/frontend/apps/web/app/invite/layout.tsx +++ b/frontend/apps/web/app/invite/layout.tsx @@ -10,7 +10,7 @@ import BaseLayout from '../BaseLayout'; export const metadata: Metadata = { title: 'Neosync', - description: 'Open Source Test 
Data Management', + description: 'Open Source Data Anonymization and Synthetic Data', icons: [{ rel: 'icon', url: '/favicon.ico' }], }; diff --git a/marketing/app/about/Hero.tsx b/marketing/app/about/Hero.tsx index 002d4975cc..403c743215 100644 --- a/marketing/app/about/Hero.tsx +++ b/marketing/app/about/Hero.tsx @@ -10,9 +10,9 @@ export default function Hero(): ReactElement {
logo
-
+

The Future is Synthetic Data Engineering -

+
Synthetic Data Engineering represents the next step in customer data security and privacy. Imagine having endless data, at your fingertips, diff --git a/marketing/app/about/page.tsx b/marketing/app/about/page.tsx index 771c2db367..5e75c659d3 100644 --- a/marketing/app/about/page.tsx +++ b/marketing/app/about/page.tsx @@ -7,10 +7,11 @@ import Values from './Values'; export const metadata: Metadata = { metadataBase: new URL('https://assets.nucleuscloud.com/'), - title: 'Neosync | About', + title: `Neosync | Learn more about Neosync's mission and values`, + description: `Neosync is mission-focused on delivering synthetic data engineering and data anonymization. Learn more about our journey and values.`, openGraph: { - title: 'Neosync | About', - description: `Learn more about Neosync's mission and values `, + title: `Neosync | Learn more about Neosync's mission and values`, + description: `Neosync is mission-focused on delivering synthetic data engineering and data anonymization. Learn more about our journey and values.`, url: 'https://www.neosync.dev/about', siteName: 'Neosync', images: [ diff --git a/marketing/app/blog/page.tsx b/marketing/app/blog/page.tsx index 740e68646c..73fc381883 100644 --- a/marketing/app/blog/page.tsx +++ b/marketing/app/blog/page.tsx @@ -13,10 +13,13 @@ import { Metadata } from 'next'; import { ReactElement } from 'react'; export const metadata: Metadata = { - title: 'Neosync | Blog', + title: 'Neosync | Blogs on data anonymization and synthetic data.', + description: + 'Read the Neosync Blog to learn more about data anonymization, synthetic data and to be kept up to date on product announcements.', openGraph: { - title: 'Neosync', - description: 'Blog', + title: 'Neosync | Blogs on data anonymization and synthetic data.', + description: + 'Read the Neosync Blog to learn more about data anonymization, synthetic data and to be kept up to date on product announcements.', url: 'https://www.neosync.dev/blog', siteName: 'Neosync', images: [ @@ -46,7 +49,12 @@ export default async function BlogPage(): Promise { return (
- {headerPost && } +

+ Blogs +

+
+ {headerPost && } +

{posts?.length ? (
diff --git a/marketing/app/layout.tsx b/marketing/app/layout.tsx index 401465bb33..e94f254937 100644 --- a/marketing/app/layout.tsx +++ b/marketing/app/layout.tsx @@ -11,9 +11,13 @@ import PHProvider, { PostHogPageview } from './providers'; export const metadata: Metadata = { metadataBase: new URL('https://assets.nucleuscloud.com/'), + title: 'Neosync | Open Source Data Anonymization and Synthetic Data', + description: + 'Neosync is an open source data anonymization and synthetic data generation platform for developers', openGraph: { - title: 'Neosync', - description: 'Open Source Synthetic Data Orchestration', + title: 'Neosync | Open Source Data Anonymization and Synthetic Data', + description: + 'Neosync is an open source data anonymization and synthetic data generation platform for developers', url: 'https://www.neosync.dev', siteName: 'Neosync', images: [ @@ -27,6 +31,9 @@ export const metadata: Metadata = { locale: 'en_US', type: 'website', }, + alternates: { + canonical: 'https://www.neosync.dev', + }, }; export default function RootLayout({ diff --git a/marketing/app/msa/page.tsx b/marketing/app/msa/page.tsx index 8336dc6625..7487b583f3 100644 --- a/marketing/app/msa/page.tsx +++ b/marketing/app/msa/page.tsx @@ -1,4 +1,29 @@ -'use client'; +import { Metadata } from 'next'; + +export const metadata: Metadata = { + metadataBase: new URL('https://assets.nucleuscloud.com/'), + title: 'Neosync | MSA', + description: 'Neosync master services agreement and terms for customers.', + openGraph: { + title: 'Neosync', + description: 'Neosync Master Services Agreement and terms for customers.', + url: 'https://www.neosync.dev', + siteName: 'Neosync', + images: [ + { + url: '/neosync/marketingsite/mainOGHero.svg', + width: 1200, + height: 630, + alt: 'mainOG', + }, + ], + locale: 'en_US', + type: 'website', + }, + alternates: { + canonical: 'https://www.neosync.dev/msa', + }, +}; export default function MSA() { return ( diff --git a/marketing/app/page.tsx b/marketing/app/page.tsx index 208de8e5de..b54a34c94a 100644 --- a/marketing/app/page.tsx +++ b/marketing/app/page.tsx @@ -11,10 +11,13 @@ import { ReactElement } from 'react'; export const metadata: Metadata = { metadataBase: new URL('https://assets.nucleuscloud.com/'), - title: 'Neosync | Synthetic Data Orchestration', + title: 'Neosync | Open Source Data Anonymization and Synthetic Data', + description: + 'Neosync is an open source data anonymization and synthetic data generation platform for developers', openGraph: { - title: 'Neosync', - description: 'Open Source Synthetic Data Orchestration', + title: 'Neosync | Open Source Data Anonymization and Synthetic Data', + description: + 'Neosync is an open source data anonymization and synthetic data generation platform for developers', url: 'https://www.neosync.dev', siteName: 'Neosync', images: [ diff --git a/marketing/app/pricing/page.tsx b/marketing/app/pricing/page.tsx index 02076f71ba..77d0f0e04d 100644 --- a/marketing/app/pricing/page.tsx +++ b/marketing/app/pricing/page.tsx @@ -11,30 +11,59 @@ import { Badge } from '@/components/ui/badge'; import { Button } from '@/components/ui/button'; import { Separator } from '@/components/ui/separator'; import { CheckCircle2Icon } from 'lucide-react'; +import Head from 'next/head'; import Link from 'next/link'; import posthog from 'posthog-js'; import { ReactElement } from 'react'; export default function Pricing() { return ( -
-
- Simple, Transparent Pricing -
-
- Pricing shouldn't be complicated, so we made it easy. -
-
- - - - -
- -
- + <> + + Neosync | Simple, transparent pricing + + + + + + + + + + + + +
+

+ Simple, Transparent Pricing +

+
+ Pricing shouldn't be complicated, so we made it easy. +
+
+ + + + +
+ +
+ +
-
+ ); } @@ -215,7 +244,7 @@ function CustomPlan(): ReactElement { ]; return ( -
+
diff --git a/marketing/app/privacy-policy/page.tsx b/marketing/app/privacy-policy/page.tsx index 9bb42e1fef..a4420161e1 100644 --- a/marketing/app/privacy-policy/page.tsx +++ b/marketing/app/privacy-policy/page.tsx @@ -1,4 +1,29 @@ -'use client'; +import { Metadata } from 'next'; + +export const metadata: Metadata = { + metadataBase: new URL('https://assets.nucleuscloud.com/'), + title: 'Neosync | Privacy Policy', + description: 'Neosync Privacy Policy and terms for customers.', + openGraph: { + title: 'Neosync', + description: 'Neosync Privacy Policy and terms for customers.', + url: 'https://www.neosync.dev', + siteName: 'Neosync', + images: [ + { + url: '/neosync/marketingsite/mainOGHero.svg', + width: 1200, + height: 630, + alt: 'mainOG', + }, + ], + locale: 'en_US', + type: 'website', + }, + alternates: { + canonical: 'https://www.neosync.dev/privacy-policy', + }, +}; export default function TOS() { return ( diff --git a/marketing/app/sitemap.ts b/marketing/app/sitemap.ts index c002e4f54f..4aaebdcfb2 100644 --- a/marketing/app/sitemap.ts +++ b/marketing/app/sitemap.ts @@ -21,7 +21,6 @@ export default async function sitemap() { const routes = [ '', '/about', - '/docs', '/blog', '/solutions/fix-staging-environments', '/solutions/reproduce-prod-bugs-locally', diff --git a/marketing/app/solutions/fix-staging-environments/hero.tsx b/marketing/app/solutions/fix-staging-environments/hero.tsx index 50ba65908d..d49b04ae6e 100644 --- a/marketing/app/solutions/fix-staging-environments/hero.tsx +++ b/marketing/app/solutions/fix-staging-environments/hero.tsx @@ -9,9 +9,9 @@ export default function Hero(): ReactElement { return (
-
+

Fix Broken Staging Environments -

+

Catch production bugs before they reach production and ship faster when you hydrate your staging environments with synthetic diff --git a/marketing/app/solutions/fix-staging-environments/page.tsx b/marketing/app/solutions/fix-staging-environments/page.tsx index ef1135b2f0..4e277d39f8 100644 --- a/marketing/app/solutions/fix-staging-environments/page.tsx +++ b/marketing/app/solutions/fix-staging-environments/page.tsx @@ -9,6 +9,8 @@ import Hero from './hero'; export const metadata: Metadata = { metadataBase: new URL('https://assets.nucleuscloud.com/'), title: 'Fix Staging Environments | Neosync', + description: + 'Fix broken staging environments using Neosync in order to deliver a better developer experience.', openGraph: { title: 'Fix Staging Environments | Neosync', description: diff --git a/marketing/app/solutions/reproduce-prod-bugs-locally/hero.tsx b/marketing/app/solutions/reproduce-prod-bugs-locally/hero.tsx index 0bee76bc33..64ad694af9 100644 --- a/marketing/app/solutions/reproduce-prod-bugs-locally/hero.tsx +++ b/marketing/app/solutions/reproduce-prod-bugs-locally/hero.tsx @@ -9,9 +9,9 @@ export default function Hero(): ReactElement { return (
-
+

Easily reproduce Production bugs locally -

+

Anonymize sensitive Production Data to make it safe to use locally for the best debugging experience diff --git a/marketing/app/solutions/reproduce-prod-bugs-locally/page.tsx b/marketing/app/solutions/reproduce-prod-bugs-locally/page.tsx index 5df2cee611..06f045262a 100644 --- a/marketing/app/solutions/reproduce-prod-bugs-locally/page.tsx +++ b/marketing/app/solutions/reproduce-prod-bugs-locally/page.tsx @@ -10,6 +10,8 @@ import ReproduceBugsLocally from './ReproduceBugsLocally'; export const metadata: Metadata = { metadataBase: new URL('https://assets.nucleuscloud.com/'), title: 'Easily reproduce Production bugs locally | Neosync', + description: + 'Easily reproduce Production bugs locally using Neosync in order to deliver a better developer experience.', openGraph: { title: 'Easily reproduce Production bugs locally | Neosync', description: diff --git a/marketing/app/solutions/security-privacy/hero.tsx b/marketing/app/solutions/security-privacy/hero.tsx index f8d268572e..803ff6e7a1 100644 --- a/marketing/app/solutions/security-privacy/hero.tsx +++ b/marketing/app/solutions/security-privacy/hero.tsx @@ -9,9 +9,9 @@ export default function Hero(): ReactElement { return (
-
+

Frictionless Security, Privacy and Compliance -

+

Easily comply with laws like HIPAA, GDPR, and DPDP with de-identified and synthetic data that structurally and statistically looks just like diff --git a/marketing/app/solutions/security-privacy/page.tsx b/marketing/app/solutions/security-privacy/page.tsx index a53b9e9242..66dfef923c 100644 --- a/marketing/app/solutions/security-privacy/page.tsx +++ b/marketing/app/solutions/security-privacy/page.tsx @@ -10,6 +10,8 @@ import Hero from './hero'; export const metadata: Metadata = { metadataBase: new URL('https://assets.nucleuscloud.com/'), title: 'Easily comply with Data Privacy, Security and Compliance | Neosync', + description: + 'Easily comply with laws like HIPAA, GDPR, and DPDP with de-identified and synthetic data that structurally and statistically looks just like your production data. ', openGraph: { title: 'Easily comply with Data Privacy, Security and Compliance | Neosync', description: diff --git a/marketing/app/solutions/unblock-local-development/hero.tsx b/marketing/app/solutions/unblock-local-development/hero.tsx index abd6b64478..a4c1274de0 100644 --- a/marketing/app/solutions/unblock-local-development/hero.tsx +++ b/marketing/app/solutions/unblock-local-development/hero.tsx @@ -9,9 +9,9 @@ export default function Hero(): ReactElement { return (
-
+

Safely test code against Production data -

+

Give developers the ability to self-serve de-identified and synthetic data for local development diff --git a/marketing/app/solutions/unblock-local-development/page.tsx b/marketing/app/solutions/unblock-local-development/page.tsx index e544474bc2..32b2fed472 100644 --- a/marketing/app/solutions/unblock-local-development/page.tsx +++ b/marketing/app/solutions/unblock-local-development/page.tsx @@ -10,6 +10,8 @@ import Hero from './hero'; export const metadata: Metadata = { metadataBase: new URL('https://assets.nucleuscloud.com/'), title: 'Unblock Local Development | Neosync', + description: + 'Unblock local development by using Neosync to anonymize sensitive data and generate synthetic data so that developers can self-serve data locally. ', openGraph: { title: 'Unblock Local Development | Neosync', description: diff --git a/marketing/app/terms-of-service/page.tsx b/marketing/app/terms-of-service/page.tsx index 33e18c2cbb..ad4682453e 100644 --- a/marketing/app/terms-of-service/page.tsx +++ b/marketing/app/terms-of-service/page.tsx @@ -1,4 +1,31 @@ -'use client'; +import { Metadata } from 'next'; + +export const metadata: Metadata = { + metadataBase: new URL('https://assets.nucleuscloud.com/'), + title: 'Neosync | Terms of Service', + description: + 'Neosync Terms of Service for customers and users of the platform.', + openGraph: { + title: 'Neosync', + description: + 'Neosync Terms of Service for customers and users of the platform.', + url: 'https://www.neosync.dev', + siteName: 'Neosync', + images: [ + { + url: '/neosync/marketingsite/mainOGHero.svg', + width: 1200, + height: 630, + alt: 'mainOG', + }, + ], + locale: 'en_US', + type: 'website', + }, + alternates: { + canonical: 'https://www.neosync.dev/terms-of-service', + }, +}; export default function TOS() { return ( diff --git a/marketing/components/landing-page/Hero.tsx b/marketing/components/landing-page/Hero.tsx index ea7a18020a..1d30b17ff9 100644 --- a/marketing/components/landing-page/Hero.tsx +++ b/marketing/components/landing-page/Hero.tsx @@ -28,10 +28,9 @@ export default function Hero(): ReactElement {
Backed by Y Combinator

-
- Open Source Data Anonymization and Synthetic Data Generation For - Developers -
+

+ Open Source Data Anonymization and Synthetic Data For Developers +

Anonymize PII, generate synthetic data and sync environments for better testing, debugging and developer experience. diff --git a/marketing/content/blog/3-types-of-deployments.mdx b/marketing/content/blog/3-types-of-deployments.mdx index fe80591a69..761a788f1d 100644 --- a/marketing/content/blog/3-types-of-deployments.mdx +++ b/marketing/content/blog/3-types-of-deployments.mdx @@ -1,6 +1,6 @@ --- title: 3 types of Zero-Downtime Deployments in Kubernetes -description: A guide to the 3 types of zero-downtime deployments in Kubernetes +description: A guide to the 3 types of zero-downtime deployments in Kubernetes - Rolling Deployments, Blue-green deloyments, Canary deployments. image: https://assets.nucleuscloud.com/neosync/blog/bluegreen.png date: 2023-11-29 published: true @@ -10,7 +10,7 @@ authors: import InlineBlogCode from '../../components/codeblocks/InlineBlogCode.tsx'; -# Intro +## Intro One of the main benefits of using a container orchestration system like Kubernetes is that you have a lot of control over your deployment strategy and how you deploy software. In a traditional software environment, deployments or upgrades to applications result in downtime and disruption of service which can frustrate users. In today's cloud native world, users expect that software should be available at all times and in all time zones. @@ -20,7 +20,7 @@ In this blog, we'll walk through three deployment strategies that you can implem Let's jump in. -# 3 types of zero-downtime deployments +## 3 types of zero-downtime deployments The three most common types of zero-downtime deployment are: @@ -30,7 +30,7 @@ The three most common types of zero-downtime deployment are: Lets take a look at all three of these zero-downtime deployment approaches in more detail. -## Rolling deployments +### Rolling deployments ![rolling](https://assets.nucleuscloud.com/neosync/blog/rollingdeploy.png) @@ -98,7 +98,7 @@ There are two optional parameters that you can define to have more control over The primary benefit of rolling deployments is that if there is an issue during the deployment process, it only affects some of the pods and not all of them and the end user typically doesn't see the impact. -## Blue-green deployments +### Blue-green deployments ![bluegreen](https://assets.nucleuscloud.com/neosync/blog/bluegreen.png) @@ -117,7 +117,7 @@ Blue-green deploys are great because there is minimal downtime and an easy way t Lastly, while Kubernetes doesn't support blue-green deployments natively (it's default is rolling as discussed above), there are CI/CD platforms such as ArgoCD and Jenkins which can help accomplish this. -## Canary deployments +### Canary deployments ![canary](https://assets.nucleuscloud.com/neosync/blog/canary.png) @@ -131,6 +131,6 @@ Similar to Blue-green deploys, you'll need to use an additional tool to be able Canary deployments are great for early issue detection. Since only a segment of users or servers are exposed to the issue, you can quickly resolve it without jeopardizing the broad user base. Additionally, canary deployments provide an ideal environment for controlled testing and validation. Because you choose the segment, you can experiment with different scenarios to help account for various real-world situations the software may encounter. This also carries some risk, as you may test the software with a non-representative sample. 
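To tie the three strategies back to concrete configuration, here is a minimal sketch of the rolling-update parameters discussed above, written as a plain TypeScript object that mirrors a Deployment manifest. The app name, image tag, replica count and the exact `maxSurge`/`maxUnavailable` values are illustrative placeholders, not values taken from this post.

```typescript
// Illustrative only: a Deployment spec written as a plain TypeScript object
// so the rolling-update knobs are easy to see. In a real cluster you would
// express the same fields as YAML and apply them with kubectl.
const deployment = {
  apiVersion: 'apps/v1',
  kind: 'Deployment',
  metadata: { name: 'my-app' }, // hypothetical name
  spec: {
    replicas: 4,
    strategy: {
      type: 'RollingUpdate',
      rollingUpdate: {
        maxSurge: 1, // at most 1 extra pod above the desired count during the rollout
        maxUnavailable: 1, // at most 1 pod may be unavailable while old pods are replaced
      },
    },
    selector: { matchLabels: { app: 'my-app' } },
    template: {
      metadata: { labels: { app: 'my-app' } },
      spec: {
        containers: [{ name: 'my-app', image: 'my-app:v2' }], // hypothetical image tag
      },
    },
  },
};

console.log(JSON.stringify(deployment, null, 2));
```

Blue-green and canary rollouts are not configured through these fields; as noted above, they typically rely on additional tooling such as ArgoCD or Jenkins to shift traffic between the two versions.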
-# Wrapping up +## Wrapping up If you're building applications and services on Kubernetes and thinking about how to achieve zero down-time deploys, then these three options are definitely ones you should consider. Depending on which route you want to go, you may have to use additional tools to achieve those goals but at the end of the day, if you're able to deliver a consistent and stable application to your end users, then it's worth it! diff --git a/marketing/content/blog/ai-driven-synthetic-data-generation.mdx b/marketing/content/blog/ai-driven-synthetic-data-generation.mdx index a037cb490d..cf382c5d49 100644 --- a/marketing/content/blog/ai-driven-synthetic-data-generation.mdx +++ b/marketing/content/blog/ai-driven-synthetic-data-generation.mdx @@ -1,6 +1,6 @@ --- title: Generating Synthetic Data with LLMs -description: A guide to using AI to generate synthetic data +description: A guide to using AI to generate synthetic data for your database and application using any LLM that is available at an endpoint. image: /images/blogs/llmheader.png date: 2024-05-21 published: true @@ -8,7 +8,7 @@ authors: - evis --- -# Intro +## Intro We recently published a blog on how to use [Neosync and Open AI to generate synthetic data](/blog/neosync-llm-integration) at the row level. That was just v0 of our AI features. We're excited to launch v1 of using LLMs to generate synthetic data on Neosync! @@ -16,7 +16,7 @@ Now you can use any model that is hosted on an endpoint to be able to generate s Let's go through a walk-through example and guide. -## Pre-requisites +### Pre-requisites In order to get started, you'll need the following: @@ -24,7 +24,7 @@ In order to get started, you'll need the following: 2. Access to an LLM hosted at an endpoint. I'll be using OpenAI but you can use any other LLM as long as it's available at a REST endpoint. 3. A database to store your data. I'm going to use Neon but either Postgres or Mysql works. -# Setting up our database +## Setting up our database For this guide, I'm going to use a Postgres database from [Neon](https://neon.tech). @@ -54,7 +54,7 @@ Last thing we'll need to do is to just get the connection URL that we'll use to Hang on to this for a minute and we can move onto setting up Neosync. -# Creating a Connection +## Creating a Connection Now that we have our database set up, we can connect Neosync to it. @@ -66,7 +66,7 @@ Let's name our connection `cust-gen-db` and then in the **Connection URL** field Nice! Looks like we can see our database and table and that our permissions are correct. Click on **Close** and then **Submit** to save the connection. -# Creating a Connection to our LLM +## Creating a Connection to our LLM Next, we need to create a connection to our LLM provider. click on **Connections** -> **+ New Connection** -> **OpenAI**. We support any LLM that uses the OpenAI interface. @@ -76,7 +76,7 @@ Fill in you Connection Name, model endpoint and then the API key. Click on **Submit** to save the connection. -# Creating a Job +## Creating a Job Jobs are how we configure and execute workflows to run and generate data or anonymize existing data. Click on **Jobs** in the top navigation menu and then click on the **+ New Job** button. Select the **AI Data Generation** job type and click **Next**. @@ -120,7 +120,7 @@ Lastly, set the total number of records that you want. We support up to 1000 rec And we're done creating the job! -# Checking the output +## Checking the output Now is the fun part. We can open up our database and see what our data looks like. 
@@ -138,6 +138,6 @@ I've been using the skin care app for a month now, and I must say I'm impressed. This looks pretty realistic!. Also, we can just do a quick scan and see that the interviews are all different and not a copy and paste which is great. -# Conclusion +## Conclusion There is a lot more that we can do here and we will continue to expand on this as we build out more features and support. Let us know if you have any feedback! diff --git a/marketing/content/blog/github-oss-metrics.mdx b/marketing/content/blog/github-oss-metrics.mdx index 5c314b04fa..24d7b515ac 100644 --- a/marketing/content/blog/github-oss-metrics.mdx +++ b/marketing/content/blog/github-oss-metrics.mdx @@ -1,6 +1,6 @@ --- title: The Official Guide to Github Metrics for Open Source Projects -description: If you're building an open source project, then you need to be looking at these Github metrics. +description: If you're building an open source project, then you need to be looking at these Github metrics. From Stars to Community insights, these are a game changer. date: 2024-02-15 published: true image: https://assets.nucleuscloud.com/neosync/blog/github-oss-metrics/github-guide-oss.png @@ -8,7 +8,7 @@ authors: - evis --- -# Introduction +## Introduction If you're working on an open source project, then you know that usage metrics can be hard to come by. With a hosted or private platform, it's easy to implement product metrics from Posthog or Mixpanel and fire off events every time a user does something. You can use these metrics to better understand user workflows and actions, usage, friction points and the overall customer journey. With an open source project, you don't get the same luxury. Even if you implement product tracking and eventing in your application, you typically have to provide customers with a way to turn that off. And most do. So then how do you understand who is using your product? And where to invest your time? @@ -16,74 +16,74 @@ We wrestled with this question for a while when we started working on [Neosync]( Below we're going to outline all of the metrics you can pull from Github for your open source project and how to interpret them. -# The Basics +## The Basics ![basics](https://assets.nucleuscloud.com/neosync/blog/github-oss-metrics/repo-basic.png) First, let's cover the basic metrics that are easy to see on every single repo since they're presented front and center. Here is a snapshot of the current Neosync repo that you find on the right hand side of the repo's main page. -## Github Stars +### Github Stars The first and most obvious metric is Github Stars. These are equivalent to "likes" on a social media platform. It's a signal of whether people are interested in your project. I don't think this is the strongest signal you can have but it's a signal nonetheless. -## Watchers +### Watchers Watchers are Github users who have signed up to get alerts about new discussions, pull requests and issues for your repo. Watchers can be found right underneath Stars. It's a signal that someone wants to stay up to date on your repo. -## Forks +### Forks Forks occur when a Github user clicks on the "Fork" button on the repo. This effectively clones your repo into a new repo in their Github account. **Note:** A Fork is different than a clone. We'll touch on that soon. I think forks are a stronger signal than Watchers and Stars because someone is saying "Yes, I want to clone what you have and use it for myself." 
It doesn't always work out this way, someone can fork the repo and never touch it, but it's a mentally more expensive action than a Star so I think it deserves a bit more weight than a star. -## Contributors +### Contributors ![contribs](https://assets.nucleuscloud.com/neosync/blog/github-oss-metrics/contribs.png) Contributors are Github users who have contributed in some way to your repo. This is typically in the form of a pull request but it doesn't always have to be code. It can be documentation, issues discussions etc. -# More Interesting Metrics +## More Interesting Metrics Now we can get onto the more interesting metrics. -## Package Downloads +### Package Downloads ![package](https://assets.nucleuscloud.com/neosync/blog/github-oss-metrics/repo-packages.png) If you're publishing packages on Github, you can click on the **Packages** link on the right hand side of the repo and see every package you publish. To the right of the Package Name, you'll see how many times that package has been downloaded. This is a good indicator of how often people are downloading your packages within their projects. -## Community Insights +### Community Insights ![community](https://assets.nucleuscloud.com/neosync/blog/github-oss-metrics/community.png) Github has a number of community insights that can give you a look into how your community is interacting with your repo from a contribution, discussion and activity perspective. To access the community insights, navigate to your repo and click on **Insights** -> **Community** This can be pretty useful to see if your community is consistently engaging with you and how they're spending their time. -# Traffic +## Traffic ![traffic](https://assets.nucleuscloud.com/neosync/blog/github-oss-metrics/traffic.png) In my opinion, this section is probably the most useful in growing your community. It's in the same section as the community insights. You can get there by navigating to your repo and clicking on **Insights** -> **Community** There are four parts to it. -## Git clones +### Git clones This graph gives you the total number of `git clone` for your repo as well as a 2 week rolling view. As we mentioned above, git clones differ from Forks in that git clones are just local clones, while forks set up a remote repo. If you hover over the graph. you can also see the breakdown between **clones** and **unique cloners**. This can give you an understanding of the number of unique users that are cloning your repo as well as, on average, how many times each user is cloning your repo by dividing clones/cloners. -## Visitors +### Visitors This graph is identical in layout to the git clones graph above but refers to visitors instead of clones. This is useful to understand the overall number of visitors to your repo and how often users are returning. -## Referring sites +### Referring sites Referring sites are useful to understand where your users are coming from. This list is also on a 14 day window. If you're doing any sort of SEO or even direct advertising, you can see which channels are driving the most traffic to your repo. -## Popular content +### Popular content Popular content tells you which pages in your repo are getting the most views and visitors. Typically your repo homepage will be in the lead here but it's also helpful to see if users are reading any particular file or your roadmap or your issues. This can help you focus your time on improving those experiences. 
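If you want these traffic numbers outside of the Insights UI, the same data is exposed by the GitHub REST traffic endpoints. The sketch below is a hedged example, assuming Node 18+ for the built-in `fetch` and a `GITHUB_TOKEN` environment variable holding a token with push access to the repository; the owner and repo names are just the ones used in this post.

```typescript
// Pull the same 14-day traffic metrics shown in the Insights UI.
const OWNER = 'nucleuscloud';
const REPO = 'neosync';
const token = process.env.GITHUB_TOKEN; // assumed to be set in your shell

async function getTraffic(resource: string): Promise<unknown> {
  const res = await fetch(
    `https://api.github.com/repos/${OWNER}/${REPO}/traffic/${resource}`,
    {
      headers: {
        Accept: 'application/vnd.github+json',
        Authorization: `Bearer ${token}`,
      },
    }
  );
  if (!res.ok) throw new Error(`GitHub API error: ${res.status}`);
  return res.json();
}

async function main(): Promise<void> {
  // clones and views return totals, uniques and a daily breakdown;
  // popular/referrers and popular/paths return ranked lists.
  const [clones, views, referrers, paths] = await Promise.all([
    getTraffic('clones'),
    getTraffic('views'),
    getTraffic('popular/referrers'),
    getTraffic('popular/paths'),
  ]);
  console.log({ clones, views, referrers, paths });
}

main().catch(console.error);
```

The same 14-day window applies here: the API returns the rolling two weeks of data that the graphs above display.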
-## Forks +### Forks ![traffic](https://assets.nucleuscloud.com/neosync/blog/github-oss-metrics/forks.png) The forks section in the **Insights** page tells you exactly who forked your repo. This is helpful because, with a little bit of Linkedin magic, you can try and see which organizations are using your project. The forks page also gives you some stats on when that user forked the project and the last time they updated it. This can help you understand how frequently they're using the project. There are also filters for the time period, repository type and a way to sort those forks so you can slice and dice the data however you'd like. -# Wrapping up +## Wrapping up One of the challenges of running an open source project is getting insights into where your users and visitors are coming from and what they're doing in your repo. Luckily, Github can help answer some of these questions with the metrics we went over above. There are a few other sections of the **Insights** page that we didn't cover such as the Code frequency, Dependency graph, Network, Commits, etc. In my opinion, these are less interesting than the metrics above at helping you understand who is visiting your repo but they can be interesting nonetheless. Of course, this is just one piece of the puzzle and you should consider product metrics as well to help you understand what users are doing in your application. diff --git a/marketing/content/blog/how-does-pg-dump-work.mdx b/marketing/content/blog/how-does-pg-dump-work.mdx index 2cdde68a95..40d5ffcbdc 100644 --- a/marketing/content/blog/how-does-pg-dump-work.mdx +++ b/marketing/content/blog/how-does-pg-dump-work.mdx @@ -1,6 +1,6 @@ --- title: A technical deep dive into how PGDUMP works -description: Learn how PGDUMP works under the covers +description: Learn how PGDUMP works under the covers and how you can use it for data migrations, backups and more. date: 2024-03-11 published: true image: /images/blogs/pgdumptech.svg @@ -8,13 +8,13 @@ authors: - evis --- -# Introduction +## Introduction PGDUMP (or pg_dump) is a [Postgres utility](https://www.postgresql.org/docs/current/app-pgdump.html) [(source)](https://github.com/postgres/postgres/blob/master/src/bin/pg_dump/pg_dump.c) that comes bundled with every PostgreSQL installation. It is a command-line tool that connects to a PostgreSQL server and constructs queries to extract database metadata and/or table data, including all of the constraints. This exported file can then be used to recreate the database on the same or another PostgreSQL server. This is really useful for backups and data migrations. Let's take a step-by-step look at exactly how PGDUMP works under the hood and then at the end go through an example. -# Step 1 - Connect and Query +## Step 1 - Connect and Query When you run the pg_dump command, the first thing it does is establish a connection to the PostgreSQL server using the provided connection string. It uses the same libpq library as the psql command-line client. This is nice because it ensures compatability across versions. @@ -38,7 +38,7 @@ Let's go through these in more detail. It's important to note that pg_dump doesn't query the tables directly. Instead, it queries the system catalogues which contain all of the metadata representing the database. The order also matters. For example, data types are queried before tables since tables may use custom types. It then buffers this metadata in memory as it gets ready for step 2. 
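To make "querying the system catalogs" a bit more concrete, here is a simplified sketch using the node-postgres (`pg`) client. The connection string is a placeholder, and the two queries are heavily trimmed stand-ins for the far more detailed lookups pg_dump actually issues.

```typescript
import { Client } from 'pg';

// Simplified versions of the catalog lookups described above: list user
// schemas, then the ordinary tables inside them. pg_dump's real queries
// pull far more detail (owners, ACLs, dependencies, etc.).
async function listSchemasAndTables(connectionString: string): Promise<void> {
  const client = new Client({ connectionString });
  await client.connect();
  try {
    const schemas = await client.query(
      `SELECT oid, nspname
         FROM pg_namespace
        WHERE nspname NOT LIKE 'pg_%' AND nspname <> 'information_schema'`
    );

    const tables = await client.query(
      `SELECT n.nspname AS schema, c.relname AS table
         FROM pg_class c
         JOIN pg_namespace n ON n.oid = c.relnamespace
        WHERE c.relkind = 'r'
          AND n.nspname NOT LIKE 'pg_%' AND n.nspname <> 'information_schema'`
    );

    console.log('schemas:', schemas.rows);
    console.log('tables:', tables.rows);
  } finally {
    await client.end();
  }
}

// Placeholder connection string: substitute your own.
listSchemasAndTables('postgres://user:pass@localhost:5432/neosync_test').catch(
  console.error
);
```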
-# Step 2: Dump the Data +## Step 2: Dump the Data Now that we have the metadata, pg_dump then starts the process of extracting the raw data for each table in the database. There are configurations for pg_dump where you only get the schema. If so, then this step is effectively skipped. If we want the data as well, then here's a breakdown of what happens: @@ -51,7 +51,7 @@ Tables with foreign keys or inheritance hierarchies are dumped in an order that At this point, pg_dump has, in memory, the contents of the pg_dump output file. Just in time for step 3! -# Step 3: Assembling the Dump File +## Step 3: Assembling the Dump File As pg_dump fetches metadata definitions and table data, it incrementally constructs the dump file by writing out sections in this order: @@ -65,13 +65,13 @@ As pg_dump fetches metadata definitions and table data, it incrementally constru The output is written as plain text with SQL statements intermixed with PGDUMP directives that indicate which database objects a section applies to. -# Step 4: Output +## Step 4: Output By default, pg_dump outputs the complete SQL script to standard output (stdout). You can optionally specify an output file to write to instead. Additionally, pg_dump can also compress the output SQL file using the specified compression format (gzip, bzip2, etc). The compressed file can then be transferred and decompressed on the destination system. -# Let's look at an example +## Let's look at an example Let's go through an example of using pg_dump to back up the "neosync_test" sample database: @@ -174,6 +174,6 @@ The dump contains the DDL to create all schemas, tables, data types, and other o When this dump file is run through psql on another PostgreSQL server, it will accurately reconstruct the complete "neosync_test" database from scratch. -# Wrapping up +## Wrapping up Pg_dump is used by thousands of companies every day to back up their databases. Conceptually, it's pretty simple but the implementation is quite complex. The source code is about 18k lines long! It's fun to take a deep dive under the covers and see exactly what's going on. I hope this technical deep dive should give you a solid understanding of the detailed process pg_dump goes through. diff --git a/marketing/content/blog/introducing-custom-transformers.mdx b/marketing/content/blog/introducing-custom-transformers.mdx index 708644ec7b..c303700c30 100644 --- a/marketing/content/blog/introducing-custom-transformers.mdx +++ b/marketing/content/blog/introducing-custom-transformers.mdx @@ -1,6 +1,6 @@ --- title: Introducing Custom Code Transformers -description: We're excited to give developers the ability to have full control over how they transform their data. +description: Use Custom Code Transformers to create your own data transformations using javascript. Implement custom logic, make API calls and more! date: 2024-01-09 published: true image: https://assets.nucleuscloud.com/neosync/blog/codeNew.png @@ -8,11 +8,11 @@ authors: - evis --- -# Introduction +## Introduction Developers love flexibility. They want to be able to do things their own way and transforming data is no different. Today I'm excited to give developers that flexibility in [Neosync](https://githunb.com/nucleuscloud/neosync) with Custom Code Transformers. -# What is a Transformer? +## What is a Transformer? A Transformer is a data-type specific module that transforms data in some way. Transformers can be deterministic or probabilistic and can anonymize data and/or generate synthetic data. 
Neosync ships with 40+ transformers out of the box. @@ -20,7 +20,7 @@ A Transformer is a data-type specific module that transforms data in some way. T For example, a `generate_float64` transformer, can generate random float64 values. Or the `generate_email` transformer can generate realistic-looking email addresses that are privacy-safe. The standard transformers that Neosync ships with are useful for some use-cases but how about when you want to transform the data in a custom way? This is where Custom Code Transformers come into play. -# Creating a Custom Code Transformer +## Creating a Custom Code Transformer Custom Code Transformers are exactly what they sound like. Developers can write their own custom code to transform the data however they'd like. Let's take a look at an example. @@ -58,7 +58,7 @@ Once you've validated your code, you can click **Save** to save your transformer ![tj](https://assets.nucleuscloud.com/neosync/blog/saved-transformer.png) -# Using the Custom Code Transformer +## Using the Custom Code Transformer Now that we've created our Custom Code Transformer, we can create a job to pull data from our source database, anonymize it and then sync it to our prod database. First, let's check our source database and look at a single row. @@ -90,7 +90,7 @@ Lastly, we could configure Subset filters here. These filters allow you to flexi We can see that our input age from our source database table had an age of 20 and now after passing it through our custom age transformer, it's bucketed into an age range of 20-24. Also, notice the other columns have changed as well as we generated synthetic data for those columns. -# Wrapping up +## Wrapping up We've walked through how to create a Custom Code Transformer and use in it a job to anonymize age data. This is a simple example but you can make this can take this as s blueprint for a more complicated example with multiple tables, millions of rows and more. diff --git a/marketing/content/blog/introducing-neosync.mdx b/marketing/content/blog/introducing-neosync.mdx index 2d3f95e0eb..f6172c0b01 100644 --- a/marketing/content/blog/introducing-neosync.mdx +++ b/marketing/content/blog/introducing-neosync.mdx @@ -1,6 +1,6 @@ --- title: Introducing Neosync -description: Open source Data Replication and Anonymization for Relational and Tabular Data +description: Open source data anonymization and synthetic data orchestration for a better developer experience and data security/privacy/compliance date: 2023-12-20 published: true image: https://assets.nucleuscloud.com/neosync/blog/neosync-header.png @@ -10,7 +10,7 @@ authors: import InlineBlogCode from '../../components/codeblocks/InlineBlogCode.tsx'; -# Introduction +## Introduction When Nick and I started Nucleus Cloud Corp, our vision was to help developers build faster, more secure and resilient applications without having to be experts in security, data privacy, infrastructure, observability, the list goes on. Today, developers have to think about so many different things in order to develop and deploy a production-ready application that it's a miracle anything gets done. We built the Nucleus Cloud Platform to solve these problems and get developers back to what they love doing, writing code. We spent a year on the Nucleus Cloud Platform and were proud of the work that we had done. But at the end of the day, it was clear to us that cloud infrastructure is a tough game for startups. So we made the decision to work on other ideas. 
@@ -20,7 +20,7 @@ At the same time, something amazing was happening in the tech industry. AI/ML ha With this context, we started to think about the intersection of these two problems and came up with Neosync. -# What is the problem? +## What is the problem? Today, developers and ML engineers don't have a way to generate high-quality synthetic data and sync it across environments. Neosync aims to solve this problem. @@ -33,13 +33,13 @@ In the near future, we're going to be releasing models that allow you to define The second thing Neosync does is that it syncs data across environments. If you're anonymizing data from prod and want to use that locally, you need to first subset the data, because you likely only need a portion of it and then secondly sync it across stage, CI and dev. Neosync natively handles all of this with a workflow orchestration framework powered by [Temporal](https://www.temporal.io). You can even sync it directly to a local DB using the Neosync CLI. For ML engineers, this can mean syncing data across environments from an S3 bucket or data lake. We made it dead simple and put it on a schedule. -# Why now? +## Why now? It's clear that AI/ML is a major platform change and the best businesses are built ontop of platform changes. There is significant demand for high quality data for building applications and training models and we want Neosync to be at the forefront of that category. At the same time, data privacy and security have become more important than ever. The number and severity of data breaches continues to increase as hackers become more sophisticated with deep fake technology. This is why securing and protecting sensitive data is so important. All of this together makes now the perfect time for Neosync to enter the world. As more developers and companies start to train and fine tune their own models, they're going to need a way to easily generate synthetic data. Especially, as data privacy laws become more stringent around the word, this need only increases. -# Why Open Source? +## Why Open Source? There are two main reasons we think open source is the right model for Neosync. The first is that for most companies, they don't want their data to leave their infrastructure. Especially if that data is sensitive. When it's inside of your infrastructure, it's a lot easier to control access to the data and audit who interacted with it. Once that data leaves your infrastructure, it's anybody's game. Open source is a great way to get adoption from mid-size and enterprise sized companies who have long procurement cycles and stringent data privacy and security programs. A developer can fork your repo and run it locally in an hour if your project is open source versus if it's not, well, then you're in procurement hell for the next 6-9 months. @@ -47,11 +47,11 @@ The second reason is that we fundamentally believe that when it comes to data, o And our promise is that we will **always** have an open source version of Neosync that you can use. That will never change. -# What is our vision? +## What is our vision? Our vision is that high-quality data is accessible to every developer and ML engineer. Whether you're building an application or training a model, you should have access to the data you need in the format and shape you need it. Data should no longer be the bottleneck to great applications and models. -# What's next? +## What's next? Build and evangelize. It's really that simple. We're going to be building a lot and talking about it a lot. 
There is a huge roadmap that we're really excited about and we can't wait to see teams using Neosync. Whether you're a developer building an application and you need high-quality synthetic data to test your application or an ML engineer fine-tuning the latest OSS model, we're building Neosync into something that can serve you. If you're interested in what we're working on and want to stay up to date or (even better) contribute, check out our [github repo.](https://github.com/nucleuscloud/neosync) diff --git a/marketing/content/blog/log-me-out.mdx b/marketing/content/blog/log-me-out.mdx index 114983ee17..b95cb0e4aa 100644 --- a/marketing/content/blog/log-me-out.mdx +++ b/marketing/content/blog/log-me-out.mdx @@ -1,6 +1,6 @@ --- title: Log Me Out -description: Details the difficulties of logging a user out of an application, and why it's harder than one might think. +description: Details the difficulties of logging a user out of an application, and why it's harder than one might think. This include a guide and code samples. date: 2024-02-03 published: true image: /images/blogs/logout.png diff --git a/marketing/content/blog/neosync-benchmarks.mdx b/marketing/content/blog/neosync-benchmarks.mdx index af99d60841..a5490f5d38 100644 --- a/marketing/content/blog/neosync-benchmarks.mdx +++ b/marketing/content/blog/neosync-benchmarks.mdx @@ -1,6 +1,6 @@ --- title: How we reduced our data generation times by 50% -description: A walkthrough of how we reduced the time it takes to generate data in Neosync by 50% + benchmarks. +description: A walkthrough of how we reduced the time it takes to generate data in Neosync by 50%. Also see our updated performance benchmarks. date: 2024-04-10 published: true image: /images/blogs/datagentime.svg @@ -8,13 +8,13 @@ authors: - evis --- -# Introduction +## Introduction Data generation is at the heart of what we do at [Neosync](https://www.neosync.dev) so it's no surprise that we're constantly trying to optimize our performance. We recently spent a few days digging into how we do our batching and connection pooling when we connect to a database and were able to further optimize our system and reduce the time it takes to generate data by 50%!. Let's jump in. -# Initial benchmarks +## Initial benchmarks To start, let's look at what we were benchmarking before we optimized our data pipelines across both job types. @@ -41,13 +41,13 @@ Up until 10,000 rows, the times are very close to each other regardless of the n Interestingly, we don't see as much separation between the 14 and 21 column runs as we do for the 7 and 14 column runs. -# Optimizations +## Optimizations Optimizing code can be really fun but you can also get trapped into hyper-optimizations. We tried really hard to narrow down our optimizations to only the most impactful ones and said early on that we only wanted to implement one or two in this round. Luckily, we had talked about doing these optimizations for a few weeks so they were top of mind. -## Batching inserts +### Batching inserts One of the first things we built when we started building Neosync was our data insertion module. It basically handled all of the data insertion logic when we insert data into one or multiple destinations. @@ -59,7 +59,7 @@ With some work, we were able to introduce batching into this process, where we c This really helped to speed things up. We also moved away from creating our own `INSERT` statements in raw SQL to using [goqu](https://github.com/doug-martin/goqu) as a SQL builder. 
It's fast and flexible and really helped to streamline our data insertion process. -## Connection limits +### Connection limits Our first implementation of Neosync didn't optimize how many connections we were opening to the database which caused intermittent connection errors if we were opening too many without closing existing ones. @@ -69,13 +69,13 @@ This would occasionally throw an error saying that have reached the connection l After some work to implement connection limits, we saw a massive drop off in connection errors and much faster data generation times. So now for each database integration, we expose a max connection limit, with the default of 80, to ensure that we don't (or rarely) hit any connection limit errors. By preventing this error, we don't have to wait and retry to open another connection which saves a lot of time. -# Post optimization benchmarks +## Post optimization benchmarks Once we implemented our optimizations, we saw a pretty big reduction in the time it takes to run our data generation and sync jobs. Let's take a look. -## Data generation +### Data generation Here are the new benchmarks for data generation jobs across the same dimensions as before. @@ -94,7 +94,7 @@ The structure of our graph stayed pretty much the same except the difference in The 21 column, 1,000,000 row job now only takes 4 minutes and 4 seconds instead of 7 minutes and 51 seconds before. That's pretty cool! -# Conclusion +## Conclusion We've been making a lot of optimizations to speed up the time it takes for us to generate, transform and sync records across databases. Anytime you can reduce your processing time by 50% you take that as a win. But there are still more things that we can do to optimize our pipelines even further. Particularly around our transformers and how we generate and anonymize data. Those will be coming soon enough and we'll publish another blog post once those improvements are out. diff --git a/marketing/content/blog/neosync-llm-integration.mdx b/marketing/content/blog/neosync-llm-integration.mdx index 7f21a562a1..f4351c4394 100644 --- a/marketing/content/blog/neosync-llm-integration.mdx +++ b/marketing/content/blog/neosync-llm-integration.mdx @@ -1,6 +1,6 @@ --- title: Using Neosync and OpenAI to generate synthetic data -description: A walkthrough tutorial on how to integrate Neosync with OpenAI to generate synthetic +description: A walkthrough tutorial on how to integrate Neosync with OpenAI to generate synthetic data at the row level including code samples! date: 2024-04-04 published: true image: /images/blogs/openai-header.png @@ -8,21 +8,21 @@ authors: - evis --- -# Introduction +## Introduction We recently helped host a [hackathon at the Github HQ](https://www.linkedin.com/feed/update/urn:li:activity:7178809047574351873) and one of the projects that we enjoyed the most was a hacker integrating Neosync with OpenAI in order to generate synthetic data. As soon as we saw it, a hundred different use-cases came to mind of how this could be useful. So I wanted to write a blog and a guide on exactly how you can integrate Neosync with not just OpenAI but any model that is available at some endpoint. We're going to use the use case that we have a customer interview platform that we want to generate synthetic data for our staging and development environments. 
Because the production customer interview data is sensitive, generating synthetic data is a secure and privacy-safe way of giving developers the data they need to build new features without compromising on data privacy. -# Pre-requisites +## Pre-requisites In order to get started, you'll need the following: -1. A Neosync account which you can sign up for free [here]("https://www.neosync.dev") +1. A Neosync account which you can sign up for free [here](https://www.neosync.dev) 2. Access to an LLM hosted at an endpoint. I'll be using OpenAI but you can use any other LLM as long as it's available at a REST endpoint. 3. A database to store your data. I'm going to use Neon but either Postgres or Mysql works. -# Setting up our database +## Setting up our database For this guide, I'm going to use a Postgres database from [Neon](https://neon.tech). @@ -52,7 +52,7 @@ Last thing we'll need to do is to just get the connection URL that we'll use to Hang on to this for a minute and we can move onto setting up Neosync. -# Creating a Connection +## Creating a Connection Now that we have our database set up, we can connect Neosync to it. @@ -64,7 +64,7 @@ Let's name our connection `cust-gen-db` and then in the **Connection URL** field Nice! Looks like we can see our database and table and that our permissions are correct. Click on **Close** and then **Submit** to save the connection. -# Creating a Custom Transformer +## Creating a Custom Transformer Neosync has the notion of Transformers. Transformers are modules that can generate synthetic data or anonymize existing data. We ship Neosync with 40+ transformers out of the box but you can also create your own [custom tranformers](/blog/introducing-custom-transformers) in code which is what we're going to do for this use case. @@ -116,7 +116,7 @@ Note that if you decide to use a different endpoint, you just have to wrap it in Next, click on **Submit** and let's move onto the final step. -# Creating a Job +## Creating a Job Jobs are how we configure and execute workflows to run and generate data or anonymize existing data. Click on **Jobs** in the top navigation menu and then click on the **+ New Job** button. Select the **Data Generation** job type and click **Next**. @@ -162,7 +162,7 @@ And after a minute or so, the job should complete like this: ![new-trans](/images/blogs/cust-job-done.png) -# Checking the output +## Checking the output Now is the fun part. We can open up our database and see what our data looks like. I'm using DBeaver as a database client to connect to my Neon database but you can also just use the SQL Editor in Neon to query your database. @@ -204,6 +204,6 @@ Interviewer: That's impressive. Thank you for sharing your insights with us toda This looks pretty realistic!. Also, we can just do a quick scan and see that the interviews are all different and not a copy and paste which is great. -# Conclusion +## Conclusion This is just a peek into what you can do with Neosync and LLMs. There is a lot more coming on this that we're working on. As I mentioned earlier, you can think of applying this to so many other use cases and data types. If you want to use LLMs to generate some synthetic data, consider checking out Neosync. 
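For readers who want to see the general shape of the LLM call that a row-level integration like the one above depends on, here is a hedged TypeScript sketch against an OpenAI-compatible chat completions endpoint. The model name, prompt and environment variable are illustrative assumptions, and this is not the Neosync custom transformer code itself.

```typescript
// Ask an OpenAI-compatible endpoint for a synthetic customer-interview transcript.
const apiKey = process.env.OPENAI_API_KEY; // assumed to be set in your shell

async function generateSyntheticInterview(): Promise<string> {
  const res = await fetch('https://api.openai.com/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify({
      model: 'gpt-4o-mini', // illustrative model name
      messages: [
        {
          role: 'system',
          content:
            'You generate realistic but entirely fictional customer interview transcripts.',
        },
        {
          role: 'user',
          content:
            'Write a short interview about a skin care app. Do not include real names or PII.',
        },
      ],
    }),
  });
  if (!res.ok) throw new Error(`LLM request failed: ${res.status}`);
  const data = await res.json();
  // The generated transcript lives on the first choice's message content.
  return data.choices[0].message.content as string;
}

generateSyntheticInterview().then(console.log).catch(console.error);
```

Because this is the standard chat completions shape, the same sketch applies to any provider that exposes a compatible REST endpoint, which is the point made above about swapping in other models.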
diff --git a/marketing/content/blog/neosync-migration.mdx b/marketing/content/blog/neosync-migration.mdx index 03a429dbda..8f5b63eff6 100644 --- a/marketing/content/blog/neosync-migration.mdx +++ b/marketing/content/blog/neosync-migration.mdx @@ -1,6 +1,6 @@ --- title: PGDUMP vs Neosync to Migrate Data across Postgres Databases -description: A comparison between pg_dump and Neosync for Postgres data migrations +description: An in-depth comparison between pg_dump and Neosync for migrating data between Postgres databases, including pros and cons. date: 2024-04-08 published: true image: /images/blogs/pgdump-migrations.svg @@ -8,39 +8,39 @@ authors: - evis --- -# Introduction +## Introduction We've been seeing more and more customers use Neosync for data migrations across Postgres databases and I wanted to write up a quick blog on how others can do this and why using Neosync to do data migrations may make more sense than using something like `pg_dump`. -# Can't I just use pg_dump? +## Can't I just use pg_dump? `pg_dump` is great tool and for a lot of use-cases, `pg_dump` your database and restoring it using `pg_restore` works just fine. But there are a few use-cases where it's not the best idea. Let's cover some of those now. -## Large databases +### Large databases For large databases, `pg_dump` and `pg_restore` can be slow, especially over network connections depending on where you're outputting the `pg_dump` file. You need enough disk space to store the dump file(s). Neosync streams the data from the source to the destination so the size of the database isn't a constraint. -## Version Compatibility +### Version Compatibility `pg_dump` supports dumping data frm older versions of Postgres to newer versions since Postgres is backwards compatible but you can't do the reverse. You might ask, why would I downgrade from a newer version to an older version? This happens more than you think. Whether it's a bug or a business specific reason, you may just have to rollback to an older version. In that case, if your versions mismatch, you might run into a problem. -## Partial Migrations +### Partial Migrations Probably the most common reason we see users using Neosync for data migrations is that `pg_dump` isn't very flexible when it comes to selecting schemas, tables and filters. Since Neosync can subset data using SQL queries, we're seeing a lot of customers use it to selectively move some data. You can learn more about this [here](/blog/subset-referential-integrity) -## Data Transformations +### Data Transformations Along with the partial migrations reason above, if you need to transform the data in any way before you insert it into the new destination, then you would need to do some pre- or post-processing of the data. Neosync makes this dead simple with [transformers](/blog/introducing-custom-transformers) -## Security and privacy +### Security and privacy When you `pg_dump` a database, you're exporting all of the data and metadata about that database. Which means that you now have a file with a lot of sensitive data laying around in an S3 bucket somewhere. Depending on your organization that might not be allowed especially depending on where the new database lives. If it's in another VPC, you'll need to a find a way to move that data from one VPC to another. This can bring up security and privacy questions. -# Using Neosync to Migrate Data +## Using Neosync to Migrate Data In order to get started, you'll need a Neosync account. You can sign up for a [free account here](https://www.neosync.dev). 
-## Creating your Connections +### Creating your Connections Once you're signed up, navigate to the **Connections** page on the top navigation bar and click on **+ New Connection**. @@ -54,7 +54,7 @@ Once that looks good, go ahead and repeat it for your destination database. Now you should have two connections, 1 for your source and another for your destination. -## Creating a Job +### Creating a Job The last step is to run a **Data Synchronization** to migrate the data. @@ -76,6 +76,6 @@ Lastly, is the **Subset** page. If you want to subset your date in any way, then That's it! Now the job will run and you'll be able to migrate data from one database to another. -# Conclusion +## Conclusion In this blog, we went over how to use Neosync to easily migrate data from one Postgres database to another. There are many ways that you can migrate data across databases but depending on your security, privacy and operational requirements, Neosync can be a great "easy button". diff --git a/marketing/content/blog/neosync-neon-data-gen-job.mdx b/marketing/content/blog/neosync-neon-data-gen-job.mdx index acdd772cb5..eb4cba8038 100644 --- a/marketing/content/blog/neosync-neon-data-gen-job.mdx +++ b/marketing/content/blog/neosync-neon-data-gen-job.mdx @@ -1,6 +1,6 @@ --- title: How to seed your Neon DB with Synthetic Data -description: A walkthrough tutorial on how to seed a Neon DB with Synthetic Data +description: A walkthrough tutorial on how to seed a Neon DB with Synthetic Data for a better developer experience while working with Neon. date: 2024-02-21 published: true image: /images/blogs/neon/neondatagen.svg @@ -8,7 +8,7 @@ authors: - evis --- -# Introduction +## Introduction [Neon DB](https://neon.tech/) is a fast-growing database that offer serverless Postgres. It's open source and gaining a lot of traction with individual developers working on side projects as well as businesses running mission critical applications. @@ -16,14 +16,14 @@ In this guide, we're going to walk through how you can seed your Neon database w Let's jump in. -# Prerequisites +## Prerequisites We're going to need a Neon account and a Neosync account. If you don't already have those, we can get those here: - [Sign up for Neon](https://console.neon/tech/login) - [Sign up for Neosync](https://www.neosync.dev) -# Setting up Neon +## Setting up Neon Now that we have our accounts, we can get this ball rolling. First, let's log into Neon. If you already have a Neon account then you can either create a new project or a new database. If you don't have a Neon account then give your project a name, your database a name and select a region like below: @@ -52,11 +52,11 @@ We can do a quick sanity check by going to **Tables** and seeing that our table Nice! Okay, last step for Neon. Let's get our connection string so we can connect to Neon from Neosync. We can find our connection string by going to **Dashboard** and then under the **Connection String** header, you can find your connection string. Hold onto this for a minute while we get Neosync set up. -# Setting up Neosync +## Setting up Neosync Now that we're in Neosync, we'll want to first create a connection to our Neon database and then create a job to generate data. Let's get started. -## Creating a Connection +### Creating a Connection Navigate over to Neosync and [login](https://app.neosync.dev). Once you're logged in, go to to **Connections** -> **New Connection** then click on **Postgres**. 
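Before pasting the Neon connection string into Neosync, it can be worth sanity-checking it locally so that any typo or SSL issue shows up on your machine rather than in the connection form. A small check might look like the following; the DSN shown is a placeholder, so swap in the string you copied from the Neon dashboard:

```python
# Quick sanity check for a Postgres connection string before using it elsewhere.
# Replace the DSN with the connection string copied from the Neon dashboard.
import psycopg2

NEON_DSN = "postgresql://user:password@your-project.neon.tech/neondb?sslmode=require"

def check_connection(dsn: str) -> None:
    with psycopg2.connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT version(), current_database()")
            version, database = cur.fetchone()
            print(f"Connected to {database}: {version}")

if __name__ == "__main__":
    check_connection(NEON_DSN)
```

If that prints the Postgres version and database name, the same string should work in the form below.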
@@ -79,7 +79,7 @@ Once you've completed filling out the form, you can click on **Test Connection** Let's click **Submit** and move onto the last part. -## Creating a Job +### Creating a Job In order to generate data, we need to create a **Job** in Neosync. Let's click on **Job** and then click on **New Job**. We're now presented with two options: @@ -136,6 +136,6 @@ SELECT * FROM users; Looking pretty good! We have seeded our Neon database with 1000 rows of completely synthetic data and it only took 12 seconds. -# Conclusion +## Conclusion In this guide, we walked through how to seed your Neon database with 1000 rows of synthetic data using Neosync. This is just a small test and you can expand this to generate tens of thousands or more rows of data across any relational database. Neosync handles the referential integrity. This is particularly helpful if you're working on a new application and don't have data yet or want to augment your existing database with more data for performance testing. diff --git a/marketing/content/blog/neosync-neon-sync-job.mdx b/marketing/content/blog/neosync-neon-sync-job.mdx index dbcbf3fc02..288ccf414f 100644 --- a/marketing/content/blog/neosync-neon-sync-job.mdx +++ b/marketing/content/blog/neosync-neon-sync-job.mdx @@ -1,6 +1,6 @@ --- title: How to Anonymize Sensitive Data in Neon -description: A walkthrough tutorial on how to anonymize sensitive data in Neon DB +description: A walkthrough tutorial on how to anonymize sensitive data in Neon DB for a better developer experience while working with Neon. date: 2024-02-21 published: true image: /images/blogs/neon/neon-sync.svg @@ -8,7 +8,7 @@ authors: - evis --- -# Introduction +## Introduction [Neon DB](https://neon.tech/) is a fast-growing database that offer serverless Postgres. It's open source and gaining a lot of traction with individual developers working on side projects as well as businesses running mission critical applications. @@ -18,14 +18,14 @@ If you haven't already done so, follow the [Seeding your Neon DB with Synthetic Let's jump in. -# Prerequisites +## Prerequisites We're going to need a Neon account and a Neosync account. If you don't already have those, we can get those here: - [Sign up for Neon](https://console.neon/tech/login) - [Sign up for Neosync](https://www.neosync.dev) -# Setting up Neon +## Setting up Neon Now that we have our accounts, we can get this ball rolling. First, let's log into Neon. If you already have a Neon account then you can either create a new project or a new database. If you don't have a Neon account then give your project a name, your database a name and select a region like below: @@ -77,11 +77,11 @@ Nice! Okay, last step for Neon. Let's get our connection strings so that we can connect to our Neon databases from Neosync. We can find our connection string by going to **Dashboard** and then under the **Connection String** header, you can find your connection string. Use the **Database** drop down to change databases so that you can get both connection strings. Hold onto this for a minute while we get Neosync set up. -# Setting up Neosync +## Setting up Neosync Now that we're in Neosync, we'll want to first create connections to our Neon database and then create a job to sync data. Let's get started. -## Creating a Connection +### Creating a Connection Navigate over to Neosync and [login](https://app.neosync.dev). Once you're logged in, go to to **Connections** -> **New Connection** then click on **Postgres**. 
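Before we finish wiring up the connections and the job itself, it can help to picture what an anonymizing sync conceptually does to each row. The toy sketch below is not Neosync's transformer implementation; the column names and the specific choices are assumptions, but it shows the kinds of per-column transformations we'll configure in a later step: a fresh UUID, a randomized street address and city, a zip code that keeps its 5-digit format, and a couple of columns passed through untouched:

```python
# Toy illustration of per-column anonymization for a row, using only the stdlib.
# Column names and the transformation choices are hypothetical.
import random
import uuid

CITIES = ["Springfield", "Riverton", "Lakeside", "Fairview", "Greenville"]
STREETS = ["Oak St", "Maple Ave", "Cedar Rd", "Elm Dr", "Pine Ln"]

def anonymize_row(row: dict) -> dict:
    return {
        "id": str(uuid.uuid4()),                               # new primary key
        "street": f"{random.randint(100, 9999)} {random.choice(STREETS)}",
        "city": random.choice(CITIES),
        "zip": f"{random.randint(0, 99999):05d}",              # keep 5-digit format
        "state": row["state"],                                 # passed through as-is
        "loyalty_plan": row["loyalty_plan"],                   # passed through as-is
    }

if __name__ == "__main__":
    original = {
        "id": "3f0c9a2e-0000-0000-0000-000000000000",
        "street": "123 Real Customer Way",
        "city": "San Francisco",
        "zip": "94105",
        "state": "CA",
        "loyalty_plan": "gold",
    }
    print(anonymize_row(original))
```

In the job configuration, each of these decisions becomes a transformer assigned to a column instead of hand-written code.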
@@ -106,7 +106,7 @@ Let's click **Submit** and repeat this for our database so that we have two conn ![neosync-test](/images/blogs/neon/neonconnections.png) -## Creating a Job +### Creating a Job In order to generate data, we need to create a **Job** in Neosync. Let's click on **Job** and then click on **New Job**. We're now presented with two options: @@ -179,6 +179,6 @@ This is our destination: We can see that we generated new UUIDs, street addresses, cities, zipcodes and prices. We had set the state and loyalty_plan columns to pass through and those came through. Looking good! -# Conclusion +## Conclusion In this guide, we walked through how to anonymize sensitive data and generate synthetic data from one Neon database to another. The cool thing about this is that it doesn't have to be from one Neon database to another. Neosync supports any Postgres database. So it can be from Neon to RDS, RDS to Neon, RDS to Cloud SQL, etc. This is just a small test and you can expand this to anonymize millions or more rows of data across any relational database. Neosync handles all of the referential integrity. If you're working with sensitive data and want a better way to protect that data, then consider Neosync to take on the heavy lifting. diff --git a/marketing/content/blog/neosync-rds-data-gen-job.mdx b/marketing/content/blog/neosync-rds-data-gen-job.mdx index 84634ea1ea..ddf8398a3f 100644 --- a/marketing/content/blog/neosync-rds-data-gen-job.mdx +++ b/marketing/content/blog/neosync-rds-data-gen-job.mdx @@ -1,6 +1,6 @@ --- title: How to seed your AWS RDS DB with Synthetic Data -description: A walkthrough tutorial on how to seed an RDS DB with Synthetic Data +description: A walkthrough tutorial on how to seed an RDS with Synthetic Data for a better developer experience while working with AWS RDS. date: 2024-03-20 published: true image: /images/blogs/rds/rds-header.svg @@ -8,7 +8,7 @@ authors: - evis --- -# Introduction +## Introduction [AWS RDS](https://aws.amazon.com/rds/) is a relational database service from AWS that is easy to set up and scale. It's used by thousands of companies across the world. @@ -16,14 +16,14 @@ In this guide, we're going to walk through how you can seed your RDS database wi Let's jump in. -# Prerequisites +## Prerequisites We're going to need an AWS account and a Neosync account. If you don't already have those, we can get those here: - [Sign up for AWS](https://aws.amazon.com/rds/) - [Sign up for Neosync](https://www.neosync.dev) -# Setting up RDS +## Setting up RDS Now that we have our accounts, we can get this ball rolling. First, let's log into our AWS console and create a new RDS instance. For this guide, we're going to create a PostgreSQL database and select basic options that will suffice for a Dev DB. If you're creating a production database, you'll likely want to optimize your RDS instance for better memory, security and Storage. @@ -60,11 +60,11 @@ We can do a quick sanity check by querying to see that our table was successfull Nice! Easy enough to get started. -# Setting up Neosync +## Setting up Neosync Next let's set up Neosync. -## Creating a Connection +### Creating a Connection Navigate over to Neosync and [login](https://app.neosync.dev). Once you're logged in, go to to **Connections** -> **New Connection** then click on **Postgres**. @@ -87,7 +87,7 @@ Once you've completed filling out the form, you can click on **Test Connection** Let's click **Submit** and move onto the last part. 
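As a point of comparison for the data-generation job we're about to configure, here is roughly what seeding the same kind of table by hand looks like with Python and the Faker library. The `users(first_name, last_name, email, age)` schema and the DSN are assumptions, and the sketch deliberately ignores foreign keys and multiple tables, which is where a purpose-built tool earns its keep:

```python
# DIY seeding sketch: insert 1000 synthetic users with Faker and psycopg2.
# The users(first_name, last_name, email, age) schema and DSN are hypothetical.
import psycopg2
from psycopg2.extras import execute_values
from faker import Faker

RDS_DSN = "postgresql://user:password@your-rds-host:5432/appdb"

def seed_users(count: int = 1000) -> None:
    fake = Faker()
    rows = [
        (fake.first_name(), fake.last_name(), fake.email(), fake.random_int(18, 90))
        for _ in range(count)
    ]
    with psycopg2.connect(RDS_DSN) as conn:
        with conn.cursor() as cur:
            execute_values(
                cur,
                "INSERT INTO users (first_name, last_name, email, age) VALUES %s",
                rows,
            )
    print(f"Inserted {len(rows)} synthetic users")

if __name__ == "__main__":
    seed_users()
```

This works fine for one flat table; the job we create next handles the same idea declaratively and scales it to related tables.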
-## Creating a Job +### Creating a Job In order to generate data, we need to create a **Job** in Neosync. Let's click on **Job** and then click on **New Job**. We're now presented with two options: @@ -144,6 +144,6 @@ SELECT * FROM users; Looking pretty good! We have seeded our RDS database with 1000 rows of completely synthetic data and it only took 1 second. -# Conclusion +## Conclusion In this guide, we walked through how to seed your RDS database with 1000 rows of synthetic data using Neosync. This is just a small test and you can expand this to generate tens of thousands or more rows of data across any relational database. Neosync handles the referential integrity. This is particularly helpful if you're working on a new application and don't have data yet or want to augment your existing database with more data for performance testing. diff --git a/marketing/content/blog/neosync-rds-sync-job.mdx b/marketing/content/blog/neosync-rds-sync-job.mdx index ce6f48227d..edac6b1adc 100644 --- a/marketing/content/blog/neosync-rds-sync-job.mdx +++ b/marketing/content/blog/neosync-rds-sync-job.mdx @@ -1,6 +1,6 @@ --- title: How to Anonymize Sensitive Data in RDS -description: A walkthrough tutorial on how to anonymize sensitive data in RDS DB +description: A walkthrough tutorial on how to anonymize sensitive data in RDS for a better developer experience while working with AWS RDS. date: 2024-03-21 published: true image: /images/blogs/rds/rds-sync-hero.svg @@ -8,7 +8,7 @@ authors: - evis --- -# Introduction +## Introduction [AWS RDS](https://aws.amazon.com/rds/) is a relational database service from AWS that is easy to set up and scale. It's used by thousands of companies across the world. @@ -18,14 +18,14 @@ If you haven't already done so, follow the [Seeding your RDS DB with Synthetic D Let's jump in. -# Prerequisites +## Prerequisites We're going to need an AWS account and a Neosync account. If you don't already have those, we can get those here: - [Sign up for AWS](https://aws.amazon.com/rds/) - [Sign up for Neosync](https://www.neosync.dev) -# Setting up RDS +## Setting up RDS Now that we have our accounts, we can get this ball rolling. First, let's log into our AWS console and create a new RDS instance. For this guide, we're going to create a PostgreSQL database and select basic options that will suffice for a Dev DB. If you're creating a production database, you'll likely want to optimize your RDS instance for better memory, security and Storage. @@ -76,11 +76,11 @@ Nice! Nice! Easy enough to get started. -# Setting up Neosync +## Setting up Neosync Now that we're in Neosync, we'll want to first create connections to our RDS databases and then create a job to sync data. Let's get started. -## Creating a Connection +### Creating a Connection Navigate over to Neosync and [login](https://app.neosync.dev). Once you're logged in, go to to **Connections** -> **New Connection** then click on **Postgres**. @@ -105,7 +105,7 @@ Once you've completed filling out the form, you can click on **Test Connection** Let's click **Submit** and repeat this for our database so that we have two connections: one for **aws-source** and one for **aws-dest**. -## Creating a Job +### Creating a Job In order to generate data, we need to create a **Job** in Neosync. Let's click on **Job** and then click on **New Job**. 
We're now presented with two options: @@ -162,6 +162,6 @@ This is our destination: We can see that we generated new first and last names, we anonymized the email address username but preserved the domain and anonymized our age. Nice! -# Conclusion +## Conclusion In this guide, we walked through how to anonymize sensitive data and generate synthetic data from one Supabase database to another. The cool thing about this is that it doesn't have to be from one Supabase database to another. Neosync supports any Postgres database. So it can be from Supabase to RDS, RDS to Supabase, RDS to Cloud SQL, etc. This is just a small test and you can expand this to anonymize millions or more rows of data across any relational database. Neosync handles all of the referential integrity. If you're working with sensitive data and want a better way to protect that data, then consider Neosync to take on the heavy lifting. diff --git a/marketing/content/blog/neosync-supabase-data-gen-job.mdx b/marketing/content/blog/neosync-supabase-data-gen-job.mdx index 553e7b07b4..d5bc2b25ad 100644 --- a/marketing/content/blog/neosync-supabase-data-gen-job.mdx +++ b/marketing/content/blog/neosync-supabase-data-gen-job.mdx @@ -1,6 +1,6 @@ --- title: How to seed your Supabase DB with Synthetic Data -description: A walkthrough tutorial on how to seed a Supabase DB with Synthetic Data +description: A walkthrough tutorial on how to seed a Supabase DB with Synthetic Data for a better developer experience while working with Supabase. date: 2024-03-05 published: true image: /images/blogs/supabase/supabaseheader.svg @@ -8,7 +8,7 @@ authors: - evis --- -# Introduction +## Introduction [Supabase](https://supabase.com/) is an open source Firebase alternative that helps you get started quickly with a backend. Supabase gets you started with a Postgres database, authentication, APIs and more. Thousands of developers use Supabase for all sizes of projects. @@ -16,14 +16,14 @@ In this guide, we're going to walk through how you can seed your Supabase databa Let's jump in. -# Prerequisites +## Prerequisites We're going to need a Supabase account and a Neosync account. If you don't already have those, we can get those here: - [Sign up for Supabase](https://supabase.com/) - [Sign up for Neosync](https://www.neosync.dev) -# Setting up Supabase +## Setting up Supabase Now that we have our accounts, we can get this ball rolling. First, let's log into Supabase. If you already have a Supabase account then you can either create a new project or use an existing project. If you don't have a Supabase account then give your database a name, type in a password and select a region like below: @@ -54,11 +54,11 @@ Nice! Okay, last step for Supabase. Let's get our connection string so we can co ![sb-created-tables](/images/blogs/supabase/sb-con.png) -# Setting up Neosync +## Setting up Neosync Now that we're in Neosync, we'll want to first create a connection to our Supabase database and then create a job to generate data. Let's get started. -## Creating a Connection +### Creating a Connection Navigate over to Neosync and [login](https://app.neosync.dev). Once you're logged in, go to to **Connections** -> **New Connection** then click on **Postgres**. @@ -83,7 +83,7 @@ As a sidenote, if you wanted to configure SSL mode, you can do that in the Supab Let's click **Submit** and move onto the last part. -## Creating a Job +### Creating a Job In order to generate data, we need to create a **Job** in Neosync. Let's click on **Job** and then click on **New Job**. 
We're now presented with two options: @@ -140,6 +140,6 @@ SELECT * FROM users; Looking pretty good! We have seeded our Supabase database with 1000 rows of completely synthetic data and it only took 3 seconds. -# Conclusion +## Conclusion In this guide, we walked through how to seed your Supabase database with 1000 rows of synthetic data using Neosync. This is just a small test and you can expand this to generate tens of thousands or more rows of data across any relational database. Neosync handles the referential integrity. This is particularly helpful if you're working on a new application and don't have data yet or want to augment your existing database with more data for performance testing. diff --git a/marketing/content/blog/neosync-supabase-sync-job.mdx b/marketing/content/blog/neosync-supabase-sync-job.mdx index 5c3988f37a..bf6a3a7bee 100644 --- a/marketing/content/blog/neosync-supabase-sync-job.mdx +++ b/marketing/content/blog/neosync-supabase-sync-job.mdx @@ -1,6 +1,6 @@ --- title: How to Anonymize Sensitive Data in Supabase -description: A walkthrough tutorial on how to anonymize sensitive data in Supabase +description: A walkthrough tutorial on how to anonymize sensitive data in Supabase for a better developer experience while working with Supabase. date: 2024-03-06 published: true image: /images/blogs/supabase/sb-sync-header.svg @@ -8,7 +8,7 @@ authors: - evis --- -# Introduction +## Introduction [Supabase](https://supabase.com/) is an open source Firebase alternative that helps you get started quickly with a backend. Supabase gets you started with a Postgres database, authentication, APIs and more. Thousands of developers use Supabase for all sizes of projects. @@ -18,14 +18,14 @@ If you haven't already done so, follow the [Seeding your Supabase DB with Synthe Let's jump in. -# Prerequisites +## Prerequisites We're going to need a Supabase account and a Neosync account. If you don't already have those, we can get those here: - [Sign up for Supabase](https://supabase.com/) - [Sign up for Neosync](https://www.neosync.dev) -# Setting up Supabase +## Setting up Supabase Now that we have our accounts, we can get this ball rolling. First, let's log into Supabase. If you already have a Supabase account then you can either create a new project or use an existing project. If you don't have a Supabase account then give your database a name, type in a password and select a region like below: @@ -67,11 +67,11 @@ Nice! Okay, last step for Supabase. Let's get our connection string so we can co ![sb-created-tables](/images/blogs/supabase/sb-dest-con.png) -# Setting up Neosync +## Setting up Neosync Now that we're in Neosync, we'll want to first create connections to our Supabase database and then create a job to sync data. Let's get started. -## Creating a Connection +### Creating a Connection Navigate over to Neosync and [login](https://app.neosync.dev). Once you're logged in, go to to **Connections** -> **New Connection** then click on **Postgres**. You can clone the existing source connection by clicking on the **supabase-source** connection and just updating the `Username` and `Password` or just create a new connection from scratch. @@ -96,7 +96,7 @@ Let's click **Submit** and repeat this for our database so that we have two conn ![neosync-test](/images/blogs/supabase/sb-connections.png) -## Creating a Job +### Creating a Job In order to generate data, we need to create a **Job** in Neosync. Let's click on **Job** and then click on **New Job**. 
We're now presented with two options: @@ -149,6 +149,6 @@ This is our destination: We can see that we generated new first and last names, we anonymized the email address username but preserved the domain and anonymized our age. Nice! -# Conclusion +## Conclusion In this guide, we walked through how to anonymize sensitive data and generate synthetic data from one Supabase database to another. The cool thing about this is that it doesn't have to be from one Supabase database to another. Neosync supports any Postgres database. So it can be from Supabase to RDS, RDS to Supabase, RDS to Cloud SQL, etc. This is just a small test and you can expand this to anonymize millions or more rows of data across any relational database. Neosync handles all of the referential integrity. If you're working with sensitive data and want a better way to protect that data, then consider Neosync to take on the heavy lifting. diff --git a/marketing/content/blog/pganonmizer-alternatives.mdx b/marketing/content/blog/pganonmizer-alternatives.mdx index cbd8c2c8b7..ab6b4b0a5c 100644 --- a/marketing/content/blog/pganonmizer-alternatives.mdx +++ b/marketing/content/blog/pganonmizer-alternatives.mdx @@ -1,6 +1,6 @@ --- title: Five Alternatives to PGAnonymizer -description: We review five alternative solutions to PG Anonymizer. +description: We review five alternative solutions to PG Anonymizer in-depth and help you pick which is the right one for your next project. date: 2024-02-26 published: true image: /images/blogs/pgalt.svg @@ -8,13 +8,13 @@ authors: - evis --- -# Introduction +## Introduction In our earlier [blog post](/blog/what-is-pg-anonymizer), we talked about Postgres Anonymizer or PGAnonymizer and how engineering teams can use it to anonymize sensitive data in their Postgres databases. While PG Anonymizer works well for many use cases, there are some use cases that it doesn't work so well. And in those cases, you may need an alternative. In this blog post, we're going to review some alternatives to PG Anonymizer and their strengths and weaknesses. Let's jump in. -# Neosync +## Neosync [Neosync](https://www.neosync.dev) is an open source synthetic test data platform that anonymizes and generates synthetic data and orchestrates it across environments. @@ -32,7 +32,7 @@ Let's jump in. - Early stage product - Limited RBAC -# Faker +## Faker One of the most commonly used open source libraries is [Faker](https://faker.readthedocs.io/en/master/). Faker started out as a Python library but has now been ported over to Golang, Javascript, C++ and other runtimes. Although not all distributions are equal in their flexibility and extensibility. We find that the Python runtime is still the most built-out. @@ -62,7 +62,7 @@ Though is a very bare bones implementation. In reality, you'll have to write a b - The effectiveness of anonymization depends on the developer's implementation - Not optimized for performance or generating large data sets of data -# YData +## YData [YData](https://ydata.ai/) is a startup that works with machine learning engineers and AI companies to help them generate synthetic data mainly for machine learning use-cases. They have a clean python SDK that is easy to use and can quickly generate synthetic data and run data quality checks. Additionally, they have a data cataloging tool that helps teams understand their data. @@ -79,7 +79,7 @@ Though is a very bare bones implementation. 
In reality, you'll have to write a b - Lack of anonymization and data masking features - Mainly support structured data and don't have extensive support for unstructured data -# Tonic.ai +## Tonic.ai [Tonic AI](https://tonic.ai) is a company that mainly focuses on creating and orchestrating test data for developers. They've been in the market since 2019 and are established in the space. They have a strong data anonymization feature set and support most databases. Let's take a look at their pros and cons. @@ -97,7 +97,7 @@ Though is a very bare bones implementation. In reality, you'll have to write a b - Don't have built out support for machine learning workflows - Not open source -# Gretel AI +## Gretel AI [Gretel AI](https://gretel.ai/) is another synthetic data company that is more similar to YData than Tonic. Gretel supports workflows for machine learning engineers and developers and can generate synthetic data for tabular and relational databases. @@ -114,6 +114,6 @@ Though is a very bare bones implementation. In reality, you'll have to write a b - Can't create your own custom anonymization using code - Not open source -# Wrapping up +## Wrapping up In this blog we covered a few alternatives to PG Anonymizer and their pros and cons. Depending on your use case, PG Anonymizer may work just fine, but if you need advanced data anonymization features, orchestration across databases and more control over our synthetic data than one of these alternative tools may do the job. diff --git a/marketing/content/blog/referential-integrity.mdx b/marketing/content/blog/referential-integrity.mdx index d473146dd9..c3f99aac54 100644 --- a/marketing/content/blog/referential-integrity.mdx +++ b/marketing/content/blog/referential-integrity.mdx @@ -1,6 +1,6 @@ --- title: What is Referential Integrity? -description: Discover what referential integrity is and why it's important in synthetic data. +description: Discover what referential integrity is and why it's important in relational databases, data anonymization and synthetic data. date: 2024-01-06 published: true image: https://assets.nucleuscloud.com/neosync/blog/ri.png @@ -8,23 +8,23 @@ authors: - evis --- -# Introduction +## Introduction Maintaining the integrity and accuracy of data within a database is critical. Understanding and implementing referential integrity is a crucial step in ensuring that data remains reliable and useful. This blog dives deep into referential integrity, its importance, and applications in both real-world databases and [synthetic data](/blog/synthetic-data-engineering). -# What is Referential Integrity +## What is Referential Integrity Referential integrity ensures that relationships in tables and between tables remain consistent as data is transformed or queried. This means that if you have a customer order in an "Orders" table, the customer ID for that order must actually exist in a "Customers" table. The customer ID in the "Orders" table would be a foreign key to the primary key customer ID in the "Customers" table. This relationship enforces data integrity and ensures that orders in the "Orders" table map to a customer in the "Customers" table. More generally, the primary table contains a primary key, a unique identifier for each record. The related table, on the other hand, includes a foreign key, which is a reference back to the primary key in the primary table. Referential integrity ensures that every foreign key in the related table matches an existing primary key in the primary table. 
If the primary key that the foreign key references was ever deleted, then the foreign key, and as a result the record, should also be deleted. -# Why is Referential Integrity important? +## Why is Referential Integrity important? Referential integrity is a key part of enforcing data accuracy within a given data set. Especially in environments where there are many tables with complex relationships, referential integrity constraints provide a safety layer to ensure that records aren't being abandoned and data quality doesn't decrease. The less commonly talked about use-case of referential integrity is that it also improves developer productivity. Most databases have a `CASCADE` command which allows the database to do the heavy lifting of cleaning up records across tables if you delete a record that has foreign keys to it. Imagine having to write a `DELETE FROM ...` statement for every single table where a record might have a foreign key to another record. That would be painful! -# Referential Integrity in Databases +## Referential Integrity in Databases Referential integrity is usually associated with relational databases where relationships are enforced at the database layer through keys and constraints. NoSQL databases, on the other hand, don't handle referential integrity like relational databases do; instead, they delegate that to the application layer. The caveat here being graph databases, which encode relationships in the edges between nodes. @@ -37,12 +37,12 @@ We've mentioned a few ways that databases handle referential integrity in the se 5. Triggers - Triggers automatically check for certain conditions and act when data is inserted, updated, or deleted. For example, a trigger could prevent deletion of a record if it would result in orphaned records in another table. 6. Stored procedures - Stored procedures encapsulate pre-prepared SQL code into well-defined transactions. They can perform multiple checks and operations atomically, ensuring that the database remains consistent. 7. Transactions - Transactions are ways of ensuring that all of the steps of an action are completed, so that data integrity holds before the record is fully added, deleted or updated in a database. -# Referential Integrity in Synthetic data +## Referential Integrity in Synthetic data Referential integrity is critical to ensuring that your data matches your database schema and doesn't break your constraints. When you're creating synthetic data, it's important to account for the table constraints that we mentioned above, otherwise you'll have issues inserting data into your schema. This goes a step further if you're doing [subsetting](/blog/subset-referential-integrity). If you want to subset your data, you have to ensure that you're not breaking any of your table constraints as well. At [Neosync](https://www.neosync.dev) we automatically handle referential integrity across all databases and tables to ensure that your schema is never broken. -# Wrapping up +## Wrapping up Referential integrity is a key component of relational databases where certain columns are linked to other columns in other tables, or even in a single table. It's also a key component of creating synthetic data that can be used for testing applications and training machine learning models. Ultimately, the goal is to enforce data quality and integrity.
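To make the `CASCADE` point above concrete, here is a tiny, self-contained illustration. It uses SQLite from the Python standard library purely so it runs anywhere with no setup (the blogs here use Postgres, but the behavior being demonstrated is the same idea): the database fans out the delete for you instead of you writing a `DELETE FROM ...` for every child table.

```python
# Demonstrates ON DELETE CASCADE: deleting a customer removes its orders too.
# SQLite is used only because it ships with Python; Postgres behaves the same way.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("PRAGMA foreign_keys = ON")  # SQLite requires opting in to FK enforcement
conn.executescript(
    """
    CREATE TABLE customers (
        id INTEGER PRIMARY KEY,
        name TEXT NOT NULL
    );
    CREATE TABLE orders (
        id INTEGER PRIMARY KEY,
        customer_id INTEGER NOT NULL REFERENCES customers(id) ON DELETE CASCADE,
        total REAL NOT NULL
    );
    INSERT INTO customers (id, name) VALUES (1, 'Ada'), (2, 'Grace');
    INSERT INTO orders (id, customer_id, total) VALUES
        (10, 1, 25.00), (11, 1, 99.50), (12, 2, 10.00);
    """
)

conn.execute("DELETE FROM customers WHERE id = 1")
remaining = conn.execute("SELECT id, customer_id FROM orders").fetchall()
print(remaining)  # [(12, 2)] -- Ada's orders were cascaded away automatically
conn.close()
```

One delete on the parent table, and the child rows that referenced it are cleaned up by the database rather than by hand-written statements.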
diff --git a/marketing/content/blog/subset-referential-integrity.mdx b/marketing/content/blog/subset-referential-integrity.mdx index 63aaa40ba0..6d3e9ecf47 100644 --- a/marketing/content/blog/subset-referential-integrity.mdx +++ b/marketing/content/blog/subset-referential-integrity.mdx @@ -1,6 +1,6 @@ --- title: Introducing Subsetting with Referential Integrity -description: A technical overview on how we implemented referential integrity subsetting. +description: A technical overview on how we implemented referential integrity within our subsetting features and how you can use for a better developer experience. date: 2024-04-02 published: true image: /images/blogs/subsetref.svg @@ -8,7 +8,7 @@ authors: - evis --- -# Introduction +## Introduction We recently shipped Subsetting with [Referential Integrity](/blog/referential-integrity) in Neosync and we wanted to give an overview of how we built referential integrity into subsetting. We expose the ability to use referential integrity in your Subsetting page through a simple switch component but under the covers there is a lot of logic and code to make this happen. @@ -16,7 +16,7 @@ We recently shipped Subsetting with [Referential Integrity](/blog/referential-in In this blog, we're going to walk through how we implemented referential integrity into our Subsetting feature. If you like graph problems, then this blog is for you. -# What is subsetting +## What is subsetting Subsetting is a technique that developers can use to shrink their database by taking a fraction of the data in their source database and moving it to their destination database. There are two main use cases for subsetting. @@ -24,13 +24,13 @@ The first is debugging errors. Say that a customers is experiencing a bug and yo The second use-case is for reducing the size of the data that you're copying across databases. If your production database is 5TB and you want to do some work locally, it's unlikely that your local database will be able to hold that much data. Subsetting is a great way to "shrink" your production database to one that is usable locally. -# Clarifying the requirements +## Clarifying the requirements The requirements for referential integrity in subsetting are actually pretty straightforward. The user story is: As a user, I want to be able to subset my database using one or `WHERE` clauses and get a dataset back that contains all of the relevant and referred data. Functionally this means that we need to understand the relationships between tables in order to select and filter the data to just the rows that we care about while accounting for primary keys, foreign keys and other dependencies. -# Defining the experience +## Defining the experience Whenever we design a new feature, we try to follow our design philosophy: flexibility without complexity. This means designing a feature that is super simple to use but is also really flexible and powerful. Here is the interface to the subsetting page: @@ -40,7 +40,7 @@ It's broken down into 3 sections. The first is the switch at the top that allows The flexibility part comes into play in the second and third sections where the user can add in multiple subsetting queries per table or across tables and validate those queries against the database. This is quite unique to the Neosync platform and we haven't seen anyone else take this approach to subsetting. The lack of complexity is manifested in a single switch that the user has to select to enable referential integrity. 
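To give a flavor of what flipping that switch asks the backend to do, here is a simplified, hypothetical sketch before we get into the real implementation. Given a root filter on one table and a map of foreign-key relationships (the tables, columns and filter below are invented for illustration), it walks the graph and derives a filter for each dependent table so the subset stays referentially intact. Neosync's actual implementation, covered next, handles far more than this, including composite keys, circular references and multiple root filters, but the core expansion looks like:

```python
# Simplified sketch: expand a root WHERE clause into per-table subset filters
# by walking foreign-key edges. Tables, columns, and the filter are hypothetical.

# child table -> (child fk column, parent table, parent key column)
FOREIGN_KEYS = {
    "orders": ("customer_id", "customers", "id"),
    "order_items": ("order_id", "orders", "id"),
    "shipments": ("order_id", "orders", "id"),
}

def expand_filters(root_table: str, root_where: str) -> dict[str, str]:
    filters = {root_table: root_where}
    changed = True
    while changed:  # keep going until no new table picks up a filter
        changed = False
        for child, (fk_col, parent, parent_key) in FOREIGN_KEYS.items():
            if parent in filters and child not in filters:
                filters[child] = (
                    f"{fk_col} IN (SELECT {parent_key} FROM {parent} "
                    f"WHERE {filters[parent]})"
                )
                changed = True
    return filters

if __name__ == "__main__":
    for table, clause in expand_filters("customers", "region = 'EU'").items():
        print(f"SELECT * FROM {table} WHERE {clause}")
```

The user only writes the `region = 'EU'` filter; every dependent table inherits a derived filter so that no row is pulled without the rows it references.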
-# Implementing referential integrity +## Implementing referential integrity We can broadly break down referential integrity into two types: linear dependencies and circular dependencies. @@ -48,7 +48,7 @@ We can broadly break down referential integrity into two types: linear dependenc In the following sections, we'll break both of these types down and explain the logic behind how we implemented this. -## Linear dependencies +### Linear dependencies Linear dependencies move in one direction and do not form an enclosed loop. In classic computer science language, this would be called a Directed Acyclic Graph (DAG). It's Acyclic because there is no loop formed. In the example above, `table a` points to `table b` and `table c` but no closed loop is formed. This is pretty common in most databases where you have a table that references another table with one or more foreign keys. @@ -72,7 +72,7 @@ If we were to insert the data in any order, our foreign keys would complain that The other wrinkle to mention here is if the user wants to transform their primary keys or data that has a foreign key. Think of a foreign key on an email address. If you want to anonymize that data, then you need to track the transformed output from the input in order to insert the correct value. But that's for another blog post for another day. -## Circular dependencies +### Circular dependencies ![subsetpage](/images/blogs/circle.png) @@ -87,7 +87,7 @@ Similar to the linear dependencies, a user would provide us with one or multiple The added layer of complexity here is in using recursive functions to manage the circular nature of these relationships. And just as in the linear dependencies, we have to be careful to insert the data in the correct order (direction) otherwise our foreign keys will panic about missing references. -# Conclusion +## Conclusion Implementing referential integrity in subsetting was a fun challenge to take on. As a team, we spent a lot of time drawing boxes and arrows on a whiteboard and thinking through the edge cases. We also implemented this for Mysql and Postgres which added an extra layer of difficult since Mysql and Postgres have different syntax and functionality. diff --git a/marketing/content/blog/synthetic-data-encryption-tokenization.mdx b/marketing/content/blog/synthetic-data-encryption-tokenization.mdx index 2a134e9c40..e7490bc242 100644 --- a/marketing/content/blog/synthetic-data-encryption-tokenization.mdx +++ b/marketing/content/blog/synthetic-data-encryption-tokenization.mdx @@ -1,6 +1,6 @@ --- title: What is the difference between Synthetic Data, Encryption and Tokenization? -description: What is the difference between synthetic data, encryption and tokenization and what are their use cases? +description: We take a deep dive and explore the difference between synthetic data, encryption and tokenization and their use cases from a data security perspective. date: 2024-02-06 published: true image: /images/blogs/est.png @@ -8,19 +8,19 @@ authors: - evis --- -# Introduction +## Introduction For any engineer working with data, it's important that they understand the tools that are available to them to protect data and when to use them. Especially, in today's world, as more of the traditional security and data privacy work is shifting left toward developers. Three of the most effective methods to protect data are: encryption, tokenization and synthetic data. In this blog we introduce all three, talk about their use-cases and compare and contrast them. Let's jump in. 
-# Synthetic Data +## Synthetic Data Synthetic data is getting more and more attention these days with the rise of AI/ML and LLMs. Increasingly, more companies are using synthetic data for security and privacy reasons as well as to train models. We think of this as [Synthetic Data Engineering](/blog/synthetic-data-engineering). In the simplest definition, synthetic data is data that a machine has completely made up from scratch. For example, you can program a pseudo-random number generator (PRNG) to randomly select 5 numbers between 0 and 25. You can then use these randomly selected numbers as indexes in the alphabet to randomly select 5 letters. If you put those 5 letters together, you've created a synthetic string! Obviously, this is a very simple example, but the point remains. You can write programs to create data that "looks" just like real data. Additionally, using machine learning models such as generative adversarial networks, or GANs, you can create synthetic data that has the same statistical characteristics as your real data. There are a number of synthetic data generators available that address different use cases. The key is to balance generation speed and accuracy. Some use cases, such as analytics, call for more (statistical) accuracy, while other use cases, such as developer testing, call for more generation speed. -## Synthetic Data Use cases +### Synthetic Data Use cases [Synthetic data](/blog/top-4-usecases-synthetic-data) is being used by developers to build applications and machine learning engineers to train models. For developers, synthetic data is helpful: @@ -31,7 +31,7 @@ Obviously, this is a very simple example but the point remains. You can write pr We're seeing new use cases come up for synthetic data all of the time as more attention and time is being spent on methods to generate higher-quality synthetic data for developers and ML engineers. -# Encryption +## Encryption Encryption has been around for decades and is a primary method for protecting data at rest and in transit. There are generally two ways to encrypt data: asymmetric encryption and symmetric encryption. The main difference is that in asymmetric encryption, you use a public key to encrypt data and then a private key to decrypt data, while in symmetric encryption, the same key is used to encrypt and decrypt data. There are pros and cons to both of these approaches, and in another blog post we'll discuss what those pros and cons are. In today's world, we generally see more asymmetric encryption than symmetric encryption because it's more secure. @@ -46,7 +46,7 @@ Encryption is widely used for a number of use cases. Here are some of the most c 3. **VPNs** - creates a secure and encrypted connection over a less secure network, such as the internet, ensuring privacy and protection from eavesdropping. Most corporations use some type of VPN as a way to protect their network traffic. 4. **Authentication** - Uses encryption to verify the authenticity of digital documents and messages, ensuring that they have not been altered and confirming the identity of the sender. -# Tokenization +## Tokenization Tokenization is encryption's lesser-known cousin that can be even more secure than encryption! Tokenization is the process of converting a piece of data to another representation by using a look-up table of pre-generated tokens that have no relation to the original data. Another way to think about tokenization is to imagine a casino.
When you go into a casino, you exchange cash for chips, use those chips in the casino, and then, when you're done, you swap the chips for cash again. Tokenization works in a similar way. You give some data to a look-up table, the look-up table returns a randomly generated token, and you can use that token until you're ready to swap it back for the original data. @@ -62,7 +62,7 @@ Tokenization is widely used to protect sensitive data in financial services and 2. **Third party data sharing** - tokenization can be used to protect data for third party data sharing. For example, you can tokenize certain columns in a data set, such as name, age and email, and then share that data set with an untrusted third party. They can process that data without being able to see the sensitive data, and then you can de-tokenize the sensitive data for your own processing. 3. **Securing PII data** - tokenization can be a powerful tool for internal data tokenization to ensure that sensitive data is secure as it travels across your internal systems. -# Compare and Contrast +## Compare and Contrast Now that we've understood what encryption, tokenization and synthetic data are, let's look at the differences and similarities and better understand their use cases. @@ -77,6 +77,6 @@ Now that we've understood what encryption, tokenization and synthetic data are, | Risk of Data Exposure | Low (no direct link to real data) | High (if encryption key is compromised) | Low (tokens are not meaningful) | | Regulatory Compliance | Can aid in compliance by avoiding use of real data | Required for data protection laws | Often used for PCI-DSS, GDPR compliance | -# Wrapping up +## Wrapping up Encryption, tokenization and synthetic data have similarities but also have key differences that are important for engineering and security teams to understand. These are all tools that every developer should be able to use given the right use cases. Hopefully, this was a good intro to the differences between encryption, tokenization and synthetic data. diff --git a/marketing/content/blog/synthetic-data-engineering.mdx b/marketing/content/blog/synthetic-data-engineering.mdx index f7b7a2d0fb..c06e8daffb 100644 --- a/marketing/content/blog/synthetic-data-engineering.mdx +++ b/marketing/content/blog/synthetic-data-engineering.mdx @@ -1,6 +1,6 @@ --- title: The Future is Synthetic Data Engineering -description: Our thoughts on why Synthetic Data Engineering is the future +description: Synthetic Data Engineering is the future, and in this blog we describe how it can massively improve developer experience and privacy. date: 2024-01-15 published: true image: https://assets.nucleuscloud.com/neosync/blog/synthetic-data 1.png @@ -8,7 +8,7 @@ authors: - evis --- -# Introduction +## Introduction For most companies, sensitive data is radioactive. If put to use correctly, it can be powerful, helping you to understand your customers and power personalized experiences. If mishandled, it can be extremely damaging. Our infrastructures are contaminated with this radioactivity. @@ -24,19 +24,19 @@ So how do we de-contaminate our infrastructures? The answer is Synthetic Data Engineering. -# Synthetic Data Engineering +## Synthetic Data Engineering Synthetic Data Engineering replaces traditional sensitive data with synthetic data in all engineering workflows where real data is not absolutely required.
When we can create synthetic data that is almost structurally and statistically identical to our production data, we can reduce the flow and usage of sensitive production data throughout our infrastructure, and not only increase our security and privacy posture but also be more productive. Let's look at some use-cases. -## Product Engineering +### Product Engineering Product engineers are typically building features for customers. In order to build and test those features, product engineers will either run a database locally or connect to a staging database thats in the cloud. These testing databases need high-quality [test data](/blog/what-is-test-data) in order to replicate real world scenarios and ensure that the features are stable. Many companies will just replicate their production database to their staging database or even to the developers local database. This is obviously not great from a security and privacy standpoint and in some industries, like healthcare where HIPAA reigns strong, not allowed. Synthetic Data Engineering for product engineers means that they can have a local or staging database with synthetic data that looks just like their production database without any of the security and privacy concerns. They can tear down and re-hydrate this database over and over again without touching back up copies or a live database. Also, they can test their features with different levels of data quality to ensure that they're handling edge cases. Overall, this creates a safer and more reliable application in the long run. -## Machine learning +### Machine learning If you ask a Machine Learning Engineer or Data Scientist what their main problems are, you'll most likely hear some version of the following: @@ -49,7 +49,7 @@ For Machine Learning Engineers and Data Scientists who are training models and s This is especially powerful in regulated industries where sensitive data is sometimes not even available to be used by Machine Learning Engineers and Data Scientists. -## Data Engineering +### Data Engineering Data Engineers spend most of their days building and maintaining pipelines that ingest, transform and move data across the organization. The two biggest problems that Data Engineers face is getting enough data to test the stability and performance of their pipelines and getting data representative enough to test out the business logic. @@ -57,7 +57,7 @@ Synthetic data engineering comes to the rescue here again. Data Engineers are no Additionally, it's all self-serve. Whether you're a Developer, Machine Learning Engineer, Date Scientist or Data Engineer, you can define the data set you need and get the data generated automatically without having to go through long review cycles. -# Core Features of Synthetic Data Engineering +## Core Features of Synthetic Data Engineering We've talked about what Synthetic Data Engineering is and how it can help engineering teams and companies move faster with less risk. Now let's talk about what the core features of Synthetic Data Engineering. @@ -73,7 +73,7 @@ Synthetic Data Engineering revolves around four key concepts: Synthetic Data Engineering starts from these core concepts and then can expand into other areas such as data anonymization, tokenization and more. -# Conclusion +## Conclusion Driving a new architecture is an audacious goal. But it's one that we strongly believe in. 
We imagine a world where Developers, Machine Learning Engineers and Data Engineers have unlimited access to high-fidelity data that is structurally and statistically identical to their production data yet doesn't have any of the security and privacy risk. diff --git a/marketing/content/blog/synthetic-data-tokenization.mdx b/marketing/content/blog/synthetic-data-tokenization.mdx index 5ba53d034f..418d68d1c1 100644 --- a/marketing/content/blog/synthetic-data-tokenization.mdx +++ b/marketing/content/blog/synthetic-data-tokenization.mdx @@ -1,6 +1,6 @@ --- title: LLM Data Privacy - Synthetic Data vs. Tokenization? -description: The best way to protect sensitive data in LLMS - synthetic data and tokenization? +description: What is the best way to protect sensitive data in LLMs - synthetic data or tokenization? We take an in-depth look at the two options. date: 2024-04-23 published: true image: /images/blogs/syndatatoken.svg @@ -8,7 +8,7 @@ authors: - evis --- -# Introduction +## Introduction AI data privacy is a hot topic these days, and there are two emerging ways of protecting sensitive data when working with LLMs: synthetic data and tokenization. @@ -16,13 +16,13 @@ In this blog, we'll cover both synthetic data and tokenization and their use-cas Let's jump in. -# Synthetic Data +## Synthetic Data Synthetic data is getting more and more attention these days with the rise of AI/ML and LLMs. It's being used to train most popular foundation models, such as [Microsoft's Phi-3 model](https://arxiv.org/abs/2404.14219). Increasingly, more companies are using synthetic data for security and privacy reasons as well as to train models. We think of this as [Synthetic Data Engineering](/blog/synthetic-data-engineering). In the simplest definition, synthetic data is data that a machine has completely made up from scratch. For example, you can program a pseudo-random number generator (PRNG) to randomly select 5 numbers between 0 and 25. You can then use these randomly selected numbers as indexes in the alphabet to randomly select 5 letters. If you put those 5 letters together, you've created a synthetic string! Obviously, this is a very simple example, but the point remains. You can write programs to create data that "looks" just like real data. Additionally, using machine learning models such as generative adversarial networks, or GANs, you can create synthetic data that has the same statistical characteristics as your real data. There are a number of synthetic data generators available that address different use cases. The key is to balance generation speed and accuracy. Some use cases, such as analytics, call for more (statistical) accuracy, while other use cases, such as developer testing, call for more generation speed. -## Synthetic Data Use cases +### Synthetic Data Use cases [Synthetic data](/blog/top-4-usecases-synthetic-data) is being used by developers to build applications and machine learning engineers to train models. For developers, synthetic data is helpful: @@ -33,7 +33,7 @@ Obviously, this is a very simple example but the point remains. You can write pr We're seeing new use cases come up for synthetic data all of the time as more attention and time is being spent on methods to generate higher-quality synthetic data for developers and ML engineers. -# Tokenization +## Tokenization Tokenization is encryption's lesser-known cousin!
Tokenization is the process of converting a piece of data to another representation by using a look-up table of pre-generated tokens that have no relation to the original data. Another way to think about tokenization is to imagine a casino. When you go into a casino, you exchange cash for chips then use those chips in the casino and then when you're done, you swap the chips for cash again. Tokenization works in a similar way. You have some data that you give to a look up table, the look up table returns back a randomly generated token, then you can use that data until you're ready to swap it back for the original data. @@ -41,7 +41,7 @@ The main difference between tokenization and encryption is that encryption is re Lastly, similar to encryption, you can create different types of tokens that preserve the length, format and other characteristics of the input data. This is particularly useful for data processing such as lookups across databases. -## Tokenization use cases +### Tokenization use cases Tokenization is widely used to protect sensitive data in financial services and card networks and is starting to gain popularity in other use cases as well. @@ -49,7 +49,7 @@ Tokenization is widely used to protect sensitive data in financial services and 2. **Third party data sharing** - tokenization can be used to protect data for third party data sharing. For example, you can tokenize certain columns in data set, such as name, age, email and then share that data set with an untrusted third party. They can process that data without being able to see the sensitive data, and then you can de-tokenize the sensitive data for your processing. 3. **Securing PII data** - tokenization can be a powerful tool for internal data tokenization to ensure that sensitive data is secure as it travels across your internal systems. -# Compare and Contrast +## Compare and Contrast Now that we've understood what encryption, tokenization and synthetic data are, let's look at the differences and similarities and better understand their use cases. @@ -64,7 +64,7 @@ Now that we've understood what encryption, tokenization and synthetic data are, | Risk of Data Exposure | Low (no direct link to real data) | Low (tokens are not meaningful) | | Regulatory Compliance | Can aid in compliance by avoiding use of real data | Often used for PCI-DSS, GDPR compliance | -# LLM data privacy +## LLM data privacy One of the main use-cases that we see with regards to data privacy and LLMs is the ability to anonymize or generate synthetic data for training or fine-tuning use-cases. In those situations a machine learning engineer can use either synthetic data or tokenization. What we know today is that data that is used to train LLMs is tokenized and vectorized before it's trained. This results in a high-dimensional graph where distances between tokens indict similarity. @@ -82,7 +82,7 @@ Let's take a simple scenario. If I have a database with 3 columns and 3 rows of Let's take a look at what this looks like with synthetic data and with tokenization. -## Synthetic data +### Synthetic data Given that our goal is to protect sensitive data without losing the semantic meaning of that data, we can create synthetic data that looks just like it but is synthetic! Here's what it would look like: @@ -96,7 +96,7 @@ We've updated our PII to generate new emails, first names and last names and the That's the power of synthetic data. 
You're able to generate net new data that looks like our sensitive data and maintains generally the same semantic meaning but is privacy-safe. -## Tokenization +### Tokenization Looking at tokenization, there are several different types of tokenization. Length-preserving and format-preserving tokenization are powerful features that mimic the format and length of the original data but the data itself is not retained. Let's see: @@ -108,6 +108,6 @@ Looking at tokenization, there are several different types of tokenization. Leng Tokenization can still anonymize sensitive data however you will likely lose the semantic meaning of the data since the tokens it generates are typically nonsense. For some use-cases this might be fine, but every bit of noise you add to the model, you make the model less accurate. So there is a trade-off and tokenization tends to add more noise than synthetic data. -# Wrapping up +## Wrapping up Tokenization and synthetic data have similarities but also have key differences that are important for engineering and security teams to understand. As LLMs become more widely used, it's important that developers and machine learning engineers understand the tools they have available and when/how to use them. Generally, when interacting with machine learning models that care about semantic meaning, synthetic data is a great choice. diff --git a/marketing/content/blog/terraform-support.mdx b/marketing/content/blog/terraform-support.mdx index 7c9195d462..4668eccc71 100644 --- a/marketing/content/blog/terraform-support.mdx +++ b/marketing/content/blog/terraform-support.mdx @@ -1,6 +1,6 @@ --- title: Neosync + Terraform -description: Introducing the Neosync Terraform provider. +description: Introducing the Neosync Terraform provider for developers and teams that are following a GitOps approach to managing their infrastructure. date: 2024-02-22 published: true image: /images/blogs/terraform.svg @@ -8,11 +8,11 @@ authors: - evis --- -# Introduction +## Introduction Terraform is one of the leading Infrastructure-as-Code (IaC) tools that is used by thousands of companies to declaratively manage their infrastructure. So it's no surprise that many in the [Neosync](https://github.com/nucleuscloud/neosync) open source community were asking us to support it. So, we're excited to launch our official Terraform Provider! If you want to get started right away with it, you find it [here](https://registry.terraform.io/providers/nucleuscloud/neosync/latest) on the Terraform provider directory along with some helpful additional [docs here](https://docs.neosync.dev/guides/terraform). -# Using Terraform to Manage Neosync +## Using Terraform to Manage Neosync Before we added Terraform support, developers could interface with Neosync through the web application and using our Golang and Typescript SDKs. Now that we have a Terraform provider, developers and devOps teams have an additional way of managing their Neosync infrastructure. @@ -22,7 +22,7 @@ Using our Terraform provider, teams can now easily create new Connections, Trans 2. For teams that follow GitOps methodologies, this approach aligns with the rest of their Infrastructure 3. Gives teams greater flexibility in creating new resource in Neosync that other teams may need -# Diving into the Provider +## Diving into the Provider The best way to understand how to use Terraform to manage your Neosync infrastructure is to look at some examples. 
@@ -68,6 +68,6 @@ If you're setting up the provider for the first time, then you'll want to jump i
For more examples and documentation, check out the [provider documentation](https://registry.terraform.io/providers/nucleuscloud/neosync/latest/docs/resources/connection).
-# Conclusion
+## Conclusion
Supporting Terraform is something that we talked about doing for months, and when our open source community started asking for it, it was the perfect time to prioritize it. We're excited to finally support it and to see how developers and DevOps teams use it to more easily manage their infrastructure.
diff --git a/marketing/content/blog/top-4-usecases-synthetic-data.mdx b/marketing/content/blog/top-4-usecases-synthetic-data.mdx
index 77746f8cb0..9c3e4e3825 100644
--- a/marketing/content/blog/top-4-usecases-synthetic-data.mdx
+++ b/marketing/content/blog/top-4-usecases-synthetic-data.mdx
@@ -1,6 +1,6 @@
---
title: Top 4 Use-Cases of Synthetic Data
-description: What is synthetic data? How can engineers use synthetic data to build better applications and to train their models?
+description: What is synthetic data? How can engineers use synthetic data to build better applications, train their models and improve their developer experience?
date: 2023-12-10
published: true
image: https://assets.nucleuscloud.com/neosync/blog/what-is-synthetic-data.png
@@ -8,33 +8,33 @@ authors:
- evis
---
-# Introduction
+## Introduction
For many companies, especially startups, getting access to high-quality data is difficult. If you're a small company, you likely don't have a lot of real-world data to use for testing your applications and infrastructure. If you're a large company, real-world data often comes with challenges such as privacy concerns, limited availability, and biases. This is where synthetic data comes in as a powerful alternative for small companies and enterprises. More broadly, [synthetic data engineering](/blog/synthetic-data-engineering) encompasses synthetic data generation, orchestration and more. In this blog, we're just going to cover synthetic data generation and its use cases.
-# What is Synthetic Data?
+## What is Synthetic Data?
Synthetic data is artificially generated data that closely resembles real-world data but does not contain any actual personally identifiable information (PII) or any of the original data. It can be created in many different ways depending on the type and format of data you need. If you just need basic integer data, then something like a random number generator will do. If you need something more complicated, like a fake hotel object that includes a name, description, room rates, pictures and so on, then generative models and deep learning algorithms might be required. The goal at the end of the day is to create data that "looks" exactly like the data you would collect in the real world but is not sensitive and is easily created.
-# What are the main use-cases for Synthetic Data?
+## What are the main use-cases for Synthetic Data?
Synthetic data is massively helpful in building and testing applications and training machine learning models, among other use-cases. Let's go through the top 4 use-cases of synthetic data.
-## 1. Testing and Validation
+### 1. Testing and Validation
Today, most developers manually create test data. They'll hand-write JSON or insert data into a database by hand and then use that to test their applications.
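As a rough illustration (the record shape and values here are invented for the example), a hand-written fixture typically ends up looking something like this:

```ts
// Hand-written test fixtures: quick to start with, tedious to maintain.
// The User shape and the values are purely illustrative.
interface User {
  id: number;
  name: string;
  email: string;
  createdAt: string;
}

const testUsers: User[] = [
  { id: 1, name: 'John Smith', email: 'john@test.com', createdAt: '2023-01-01' },
  { id: 2, name: 'Jane Doe', email: 'jane@test.com', createdAt: '2023-01-02' },
  // every additional case has to be typed out by hand
];
```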
Hand-writing data like this is horribly inefficient, and developers will usually forget to test for edge cases such as non-ASCII characters, ill-formatted text and more. This is where synthetic data comes to the rescue. Since it's easy and cheap to create, you can generate different types of synthetic data that cover the happy path as well as the edge cases. Overall, this leads to a more resilient and secure application for your customers.
-## 2. Performance Testing
+### 2. Performance Testing
For many companies, especially startups, getting data at scale isn't easy. If you haven't launched your product but want to see how it would perform under pressure, you need a lot of data to be able to replicate that traffic at scale. This is where synthetic data can be really useful. You can easily and quickly create millions of records to test your application and infrastructure and see if they handle the load.
-## 3. Protect Data Privacy and Security
+### 3. Protect Data Privacy and Security
Sensitive data should be protected and not made available to everyone in an organization, and that includes engineering teams. So how does an engineer get representative data to test their applications? Synthetic data to the rescue! Synthetic data enables developers to build and test applications without requiring access to sensitive real-world data, which protects user privacy and helps you comply with data privacy regulations.
-## 4. Training AI and ML Models
+### 4. Training AI and ML Models
Nowadays, every company is using AI/ML and/or building their own AI/ML models. Synthetic data originated in the AI/ML world as a way to help train models when engineers didn't have enough real-world data. Some of the main use-cases in AI/ML are:
@@ -46,12 +46,12 @@ Nowadays, every company is using AI/ML and/or building their own AI/MLAI models.
You can see these use-cases at play in most industries, from healthcare companies using synthetic data to train models that diagnose tumors to financial services companies using models to detect and quantify risk in their financial positions.
-# How is synthetic data created?
+## How is synthetic data created?
Depending on the type of synthetic data that you need, it can be created in different ways. If you need statistically consistent synthetic data, then a GAN model can help create it. If you need more deterministic synthetic data, then transformers like the ones that [Neosync](https://www.neosync.dev) provides can be used.
At the end of the day, it's important to measure the quality of the synthetic data compared to the original data set and ensure that the data sets are aligned.
-# Conclusion
+## Conclusion
As data privacy concerns continue to rise and the demand for AI/ML grows, developers will need to rely on synthetic data to build and test their applications as well as train their AI/ML models. Luckily, as AI/ML models get better, we can create even better synthetic data to build more resilient and smarter applications.
diff --git a/marketing/content/blog/what-is-pg-anonymizer.mdx b/marketing/content/blog/what-is-pg-anonymizer.mdx
index 9aad1c0c6c..f3e9e0d97e 100644
--- a/marketing/content/blog/what-is-pg-anonymizer.mdx
+++ b/marketing/content/blog/what-is-pg-anonymizer.mdx
@@ -1,6 +1,6 @@
---
title: A Comprehensive Guide to PGAnonymizer
-description: Learn what Postgres Anonymizer is and how you can use it to anonymize data in Postgres.
+description: Learn what Postgres Anonymizer is and how you can use it to anonymize data in Postgres for better data security and privacy.
date: 2024-02-20
published: true
image: /images/blogs/pganon.svg
@@ -8,13 +8,13 @@ authors:
- evis
---
-# Introduction
+## Introduction
Postgres is one of the most popular databases in the world, and it comes with a pretty extensive library of extensions and open source tools that the Postgres team and outside developers have created to make working with Postgres easier and safer. One of the more popular extensions is [Postgres Anonymizer](https://postgresql-anonymizer.readthedocs.io/en/stable/), usually just called PGAnonymizer. PGAnonymizer is used to anonymize your Postgres database. This is useful if you're working with sensitive data and want a way to protect the security and privacy of that data while developers are working with it.
In this blog, we're going to dive deeper into PGAnonymizer: what it is, how it works and how you can install it and use it in your Postgres database.
-# What is PGAnonymizer?
+## What is PGAnonymizer?
PGAnonymizer is an open source Postgres extension built to anonymize data within Postgres. It allows you to obfuscate sensitive information such as names, addresses, emails, and other personally identifiable information (PII). You can then use this data for testing and development purposes without compromising the security of real-world information.
@@ -26,7 +26,7 @@ It's configured declaratively (more on this below) using the Postgres Data Defin
Lastly, PGAnonymizer does come with some detection functions that will try to guess which columns need to be anonymized. These work by looking at column names to figure out which columns seem to hold sensitive data.
-# Use cases
+## Use cases
There are a number of use cases for data anonymization and creating [test data](/blog/what-is-test-data). Here are some of the main ones:
@@ -36,7 +36,7 @@ There are a number of use cases for data anonymization and creating [test data](
- **Research**: Researchers can leverage anonymized datasets for analysis and training purposes without infringing upon individual privacy. They can also share research findings ethically without revealing underlying personal information.
- **Data Minimization**: By anonymizing irrelevant data points, you can minimize the amount of sensitive information stored in your databases, reducing the attack surface and enhancing overall security.
-# Anonymization Functions
+## Anonymization Functions
PGAnonymizer has a series of anonymization functions that transform sensitive data into anonymized data. You can use one or more of these functions depending on how you want to anonymize your data. We can categorize the functions into a few buckets:
@@ -48,7 +48,7 @@ PGAnonymizer has a series of anonymization functions to transform sensitive data
The best way to understand how these functions work is by looking at a few examples. First, let's install PGAnonymizer and then see how we can use it.
-# Installation
+## Installation
Depending on your setup, you can install PGAnonymizer in a few different ways. In this example, we'll walk through how to do it locally, but if you're using something like Docker, you can follow [this guide](https://postgresql-anonymizer.readthedocs.io/en/stable/INSTALL/#install-with-docker).
@@ -85,7 +85,7 @@ SELECT anon.init();
That's it! Easy enough to get started. Let's dig into some of the anonymization functions that PGAnonymizer has.
-## Partial Scrambling
+### Partial Scrambling
Partial scrambling is a masking technique that hides part of the data. For instance, a credit card number can be replaced by '40XX XXXX XXXX XX96'. There are two functions available for partial scrambling:
@@ -94,7 +94,7 @@ Partial scrambling is a masking function that leaves out some part of the data.
By passing in the email function, PGAnonymizer will recognize that it's an email address and partially scramble the username and domain, which are the identifying elements of the email address.
-## Static Masking Rules
+### Static Masking Rules
Static masking rules let you declare, up front, how an entire database should be masked. For each column in a table, you declare a rule that defines how you want to transform that data.
@@ -116,7 +116,7 @@ Once you're ready to anonymize your table, you can run `SELECT anon.anonymize_da
This is just the start of what you can do with PGAnonymizer. There are more functions and ways to anonymize, encrypt, hash and obfuscate data. For more information, check out the [documentation](https://postgresql-anonymizer.readthedocs.io/en/stable/).
-# Limitations
+## Limitations
PGAnonymizer is pretty flexible, but there are some limitations that you should be aware of. Here are the biggest ones:
@@ -124,10 +124,10 @@ PGAnonymizer is pretty flexible but there are some limitations that you should b
2. **Metadata Leakage** - Depending on how you anonymize the data, anonymized data might still contain metadata (table names, column types, etc.) that could reveal sensitive information.
3. **No Referential Integrity** - Arguably the biggest limitation is that PGAnonymizer does not handle referential integrity. If you anonymize a primary key that is referenced by a foreign key, it will likely break that relation and cause problems in your database.
-# PGAnonymizer vs Neosync
+## PGAnonymizer vs Neosync
PGAnonymizer can be a great option depending on the use-case, but for those who need more power and flexibility, [Neosync](https://www.neosync.dev) takes PGAnonymizer to the next level by addressing many of its weaknesses, namely [referential integrity](/blog/referential-integrity) and orchestration. Additionally, Neosync has a GUI and doesn't require the user to define everything in SQL, which makes for a better developer experience.
-# Conclusion
+## Conclusion
PGAnonymizer is a powerful Postgres extension that allows you to anonymize sensitive data, making it usable for development and testing. It has a lot of flexibility but also some limitations to consider. At the end of the day, the tool that fits your use case is the right tool for you, and you should consider all available options to protect customer data and privacy.
diff --git a/marketing/content/blog/what-is-platform-engineering.mdx b/marketing/content/blog/what-is-platform-engineering.mdx
index 24b748152f..a92178a4e8 100644
--- a/marketing/content/blog/what-is-platform-engineering.mdx
+++ b/marketing/content/blog/what-is-platform-engineering.mdx
@@ -8,7 +8,7 @@ authors:
- nick
---
-# Intro
+## Intro
As consumers demand more capabilities from their software platforms, developers' jobs are becoming more complex. Rather than code every new feature from scratch, developers now rely on sophisticated internal platforms to work more efficiently and productively.
@@ -18,7 +18,7 @@ The need for these internal, developer-facing platforms has led to the rise of a
So what is platform engineering?
Should your organization consider investing in this new role? What are the benefits to your development team? Read on to learn more about this emerging trend in IT and software engineering.
-# What is Platform engineering?
+## What is Platform engineering?
In a sentence, **platform engineering is the function of designing and building platforms, toolchains, and workflows for internal developer use.**
software more efficiently.
Historically, this function has been rolled up within DevOps, with a broad focus on providing automations and workflows that support the maintenance and running of the application. As we'll discuss in more detail below, the changing state of software development requires a new, distinct role whose sole purpose is to support the developer.
-# How does Platform Engineering work?
+## How does Platform Engineering work?
The core product that platform engineers provide to the organization is the Internal Developer Platform (IDP). This integrated product helps developers move quickly throughout the entire software development lifecycle (SDLC).
@@ -51,7 +51,7 @@ Some of the key functions of platform engineering teams include:
- **Centralizing systems and configurations** to reduce the number of systems that developers and development teams have to access, further increasing the visibility of internal tooling
-# How has Platform Engineering developed and evolved?
+## How has Platform Engineering developed and evolved?
So what's changed? Why is platform engineering on the rise in the industry? Here's a summary of the three major changes that have brought us to this point.
@@ -67,35 +67,35 @@ As a result, the SRE and DevOps role is evolving into what we call platform engi
Because of the increasing complexity of even basic technologies, these internal tools and resources are absolutely essential. In order to work faster and maintain productivity and efficiency, developers need platforms that enable them to manage this complexity more effectively.
-# How does Platform Engineering benefit the organization?
+## How does Platform Engineering benefit the organization?
Although it provides a number of tangible benefits to developers, platform engineering comes with costs, and those costs require justification. Here are some of the key benefits that development teams and organizations receive when they invest in platform engineering.
-## Higher velocity of development
+### Higher velocity of development
Instead of developers building repeatable functionality from scratch, platform engineers replicate and distribute those base components. Developers can then access those platforms, programs, code templates, and other building blocks and plug them into their existing projects, tweaking them when necessary.
This increased scalability enables developers to work faster, be more productive, and drive more results for the organization.
-## Reduced developer cognitive load
+### Reduced developer cognitive load
Software development, coding, and programming are creative skills. It takes brainpower not only to build code from scratch, but also to edit code, debug, and fix issues when they arise.
By building necessary components in advance and providing developers self-serve access, platform engineers reduce developers' cognitive load. Rather than engaging in rote tasks, developers can devote their brain power to innovation and creative problem solving.
-## Improved development culture
+### Improved development culture
The efficiency of your development team is based not only on _what_ your developers are working on, but _how_ they work. Just like any organization, your culture sets the tone for the entire team.
Platform engineers open the door for developers to take ownership over configuration, deployment, and roll-back processes without needing to involve operations. Internal platforms can also increase visibility, improving collaboration within teams. The result: developers are free to be creative without the fear of breaking everything.
-## Improved MTTR & Change Failure Rates
+### Improved MTTR & Change Failure Rates
There is a strong correlation between the use of Internal Developer Platforms and an organization's degree of DevOps evolution. By providing proven, tested platforms for developers to use, platform engineers reduce the risk of bugs cropping up in the system.
Additionally, when the occasional problem does arise, platform engineers have the documentation necessary to quickly and easily track down the issue. This is much more difficult to achieve when each developer is building their own code, especially in a distributed microservices architecture.
-# Final thoughts on Platform Engineering
+## Final thoughts on Platform Engineering
As technological capabilities become infinitely more complex, the need for platform engineering is only going to grow. Developers will no longer be able to do their jobs on their own; they'll need the support and abstraction that come with platforms tailored to increasing their efficiency and productivity.
diff --git a/marketing/content/blog/what-is-test-data.mdx b/marketing/content/blog/what-is-test-data.mdx
index e9aa2e0091..a168c063cd 100644
--- a/marketing/content/blog/what-is-test-data.mdx
+++ b/marketing/content/blog/what-is-test-data.mdx
@@ -8,11 +8,11 @@ authors:
- evis
---
-# Introduction
+## Introduction
Test data is one of the most important yet least talked about parts of software engineering. At some point, every developer has thought that their code was perfect and would function without bugs. Then they wrote some test data and realized that some bugs only appear when you start to put data through the system. Whether it's type mismatches, state management issues, scalability and performance bottlenecks or something else, testing your code with data is the only way to be certain that it actually works. So let's take a look at what test data is, how developers use it and how to create it.
-# What is Test Data?
+## What is Test Data?
Test data is a set of data that is designed to test the functionality, performance, and security of an application. The idea is to simulate, as closely as you can, the real-world data that your application is expected to encounter when it's live. There are a few different types of test data:
@@ -22,11 +22,11 @@ Test data is a set of data that is designed test the functionality, performance,
Depending on the feature and test case scenario, one of these types might make more sense than the others. Often, it's a combination of different types of test data that you need to sufficiently test your application. We'll look at some examples below.
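For instance, here is a small, purely illustrative TypeScript sketch of two kinds of test data discussed in this post: static test data that never changes, and generated, production-like data built from a factory function. The `Order` shape and its fields are invented for the example.

```ts
// Illustrative only: the Order shape and values are made up for this example.
interface Order {
  id: string;
  amount: number; // in cents
  currency: string;
  note: string;
}

// Static test data: fixed, predictable and easy to reuse across test runs.
const staticOrder: Order = {
  id: 'order-1',
  amount: 1999,
  currency: 'USD',
  note: 'plain ASCII note',
};

// Generated, production-like data: randomized values that also cover edge cases.
function generateOrder(): Order {
  const notes = ['', 'ünïcödé 🚀', 'a'.repeat(10_000), 'normal note'];
  return {
    id: `order-${Math.random().toString(36).slice(2, 10)}`,
    amount: Math.floor(Math.random() * 1_000_000),
    currency: Math.random() < 0.5 ? 'USD' : 'EUR',
    note: notes[Math.floor(Math.random() * notes.length)],
  };
}

// Build a large batch for load or boundary testing.
const orders: Order[] = Array.from({ length: 1_000 }, generateOrder);
```

Static fixtures work well for deterministic unit tests, while generated data is what you reach for when you need volume or want to surface edge cases you wouldn't think to write by hand.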
-# What is Test Data Management?
+## What is Test Data Management?
Test Data Management refers to the overall workflow of generating and managing test data. The goal is to keep test data accurate, secure and well managed so that developers can confidently test their applications. For example, say that a team of developers pulls data from a staging database to develop locally. Later, they submit a pull request for their feature only to see that their tests are now failing. But it works locally, so what could be the cause? One possible issue is that the test database and the staging database are using two different datasets. To verify this, a test data management platform can help sync data across environments so that the developer can narrow down the root cause and fix it. Versioning and syncing are core features of test data management, along with data anonymization, subsetting and validation.
-# Why is Test Data important?
+## Why is Test Data important?
High-quality test data is essential for several reasons:
@@ -36,7 +36,7 @@ High-quality test data is essential for several reasons:
4. Confidence in Release: Comprehensive test data coverage increases your confidence that the application works as intended.
5. Creating Demos: Most SaaS applications have a demo version that sales reps use to demo to customers. Those applications need test data to show off their features and functionality. Having high-quality test data that you can easily reset is a great way to run high-fidelity demos.
-# How do developers use Test Data?
+## How do developers use Test Data?
Now that we have a pretty good understanding of what test data is and why it's important, let's take a look at how it's used. Test data can be used throughout the entire SDLC to ensure that your application is ready for production use. Here are some ways to use test data:
@@ -46,7 +46,7 @@ Now that we have a pretty good understanding of what test data is and why it's i
4. Performance Testing: testing the application's performance under different data loads to identify bottlenecks and optimize performance.
5. Security Testing: testing the application's security with malicious data to detect vulnerabilities and prevent attacks.
-# How to create effective Test Data?
+## How to create effective Test Data?
There are many different ways to create test data depending on what you want to test. As we mentioned above, different scenarios call for different types of test data. For example, static test data is fairly straightforward to create because it doesn't change often and can easily be reused. On the other hand, production-like data is much more difficult to create because you need to think about the security and privacy concerns of the data leaving a secure environment. Here is how to approach creating test data:
@@ -62,13 +62,13 @@ There are many different ways to create test data depending on what your want to
4. Manage the data lifecycle. Now that the data has been generated, you'll need to think about versioning, updating and maintaining the data throughout its lifecycle.
-# Creating Test Data with Neosync
+## Creating Test Data with Neosync
[Neosync](https://www.neosync.dev) is built to create production-like test data that developers can use to test their applications. The best part is that it's all self-serve, and developers have complete flexibility over the schema and how the data is generated. Neosync ships with 40+ Transformers out of the box, or developers can create their [own using custom JavaScript](/blog/introducing-custom-transformers).
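As a rough, hypothetical illustration only (this is not Neosync's actual custom-transformer interface; the function name and signature are made up), a custom transformer conceptually boils down to a small JavaScript or TypeScript function that takes an input value and returns an anonymized one:

```ts
// Hypothetical sketch of what a custom transformer conceptually does.
// The signature below is illustrative and not Neosync's real API.
function maskEmailTransformer(value: string): string {
  const [user, domain] = value.split('@');
  if (!user || !domain) {
    return 'invalid@example.com'; // fallback for ill-formatted input
  }
  // Keep the first character and the domain so the output still reads like an email.
  return `${user[0]}${'*'.repeat(Math.max(user.length - 1, 1))}@${domain}`;
}

console.log(maskEmailTransformer('john.smith@acme.com')); // j*********@acme.com
```

The idea in Neosync is similar: a transformer is applied per column, which is what gives developers that full control over how each field is generated or anonymized.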
Test data can be created by anonymizing existing data or generating net-new data using Neosync's [synthetic data generation](/blog/neosync-neon-data-gen-job).
-# Wrapping up
+## Wrapping up
Test data, though often overlooked, plays a crucial role in building secure and resilient applications. Whether you're training machine learning models or testing a SaaS application, test data can be the difference between a great user experience and a not-so-great one.