kylebarron · kylebarron · Aug 22, 2024 · Aug 22, 2024 · Aug 22, 2024
diff --git a/arro3-core/python/arro3/core/_core.pyi b/arro3-core/python/arro3/core/_core.pyi
@@ -1,6 +1,7 @@
 from typing import Any, Iterable, Literal, Sequence, overload
 
 import numpy as np
+import pandas as pd
 from numpy.typing import NDArray
 
 from .types import (
@@ -1514,6 +1515,22 @@ class Table:
             batches: Sequence of RecordBatch to be converted, all schemas must be equal.
             schema: If not passed, will be inferred from the first RecordBatch. Defaults to None.
 
+        Returns:
+            New Table.
+        """
+    @classmethod
+    def from_pandas(
+        cls,
+        df: pd.DataFrame,
+        *,
+        schema: ArrowSchemaExportable | None = None,
+    ) -> Table:
+        """Construct a Table from a pandas DataFrame
+
+        Args:
+            df: pandas DataFrame to convert to Arrow
+            schema: If not passed, will be inferred.
+
         Returns:
             New Table.
         """

diff --git a/pyo3-arrow/src/interop/mod.rs b/pyo3-arrow/src/interop/mod.rs
@@ -1 +1,2 @@
 pub mod numpy;
+pub(crate) mod pandas;
diff --git a/pyo3-arrow/src/interop/pandas/from_pandas.rs b/pyo3-arrow/src/interop/pandas/from_pandas.rs
@@ -0,0 +1,63 @@
+use arrow_array::RecordBatch;
+use arrow_schema::{Schema, SchemaRef};
+use indexmap::IndexMap;
+use numpy::PyArrayDescr;
+use pyo3::intern;
+use pyo3::prelude::*;
+
+use crate::error::PyArrowResult;
+use crate::PyTable;
+
+enum PandasDtype<'a> {
+    Numpy(&'a PyArrayDescr),
+}
+
+impl<'py> FromPyObject<'py> for PandasDtype<'py> {
+    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
+        Ok(Self::Numpy(ob.extract()?))
+    }
+}
+
+pub fn from_pandas_dataframe(
+    py: Python,
+    df: &Bound<PyAny>,
+    schema: Option<SchemaRef>,
+) -> PyArrowResult<(Vec<RecordBatch>, SchemaRef)> {
+    // If pandas 2.2+ and the Arrow C Stream export works, prefer that.
+    if df.hasattr(intern!(py, "__arrow_c_stream__"))? {
+        if let Ok(table) = df.extract::<PyTable>() {
+            return Ok(table.into_inner());
+        }
+    }
+
+    let schema = if let Some(schema) = schema {
+        schema
+    } else {
+        infer_arrow_schema(py, df)?
+    };
+    let batch = import_batch(py, df, &schema)?;
+    Ok((vec![batch], schema))
+}
+
+fn infer_arrow_schema<'py>(py: Python<'py>, df: &'py Bound<PyAny>) -> PyResult<SchemaRef> {
+    let dtypes = access_dtypes(py, df)?;
+    todo!()
+}
+
+fn import_batch<'py>(
+    py: Python<'py>,
+    df: &'py Bound<PyAny>,
+    schema: &Schema,
+) -> PyResult<RecordBatch> {
+    todo!()
+}
+
+fn access_dtypes<'py>(
+    py: Python<'py>,
+    df: &'py Bound<PyAny>,
+) -> PyResult<IndexMap<String, PandasDtype<'py>>> {
+    let dtypes_dict = df
+        .getattr(intern!(py, "dtypes"))?
+        .call_method0(intern!(py, "to_dict"))?;
+    dtypes_dict.extract()
+}
diff --git a/pyo3-arrow/src/interop/pandas/mod.rs b/pyo3-arrow/src/interop/pandas/mod.rs
@@ -0,0 +1 @@
+pub(crate) mod from_pandas;
diff --git a/pyo3-arrow/src/table.rs b/pyo3-arrow/src/table.rs
@@ -21,6 +21,7 @@ use crate::ffi::to_schema_pycapsule;
 use crate::input::{
     AnyArray, AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, SelectIndices,
 };
+use crate::interop::pandas::from_pandas::from_pandas_dataframe;
 use crate::schema::display_schema;
 use crate::utils::schema_equals;
 use crate::{PyChunkedArray, PyField, PyRecordBatch, PyRecordBatchReader, PySchema};
@@ -280,6 +281,18 @@ impl PyTable {
         Ok(Self::try_new(batches, schema)?)
     }
 
+    #[classmethod]
+    #[pyo3(signature = (df, *, schema=None))]
+    fn from_pandas(
+        _cls: &Bound<PyType>,
+        py: Python,
+        df: &Bound<PyAny>,
+        schema: Option<PySchema>,
+    ) -> PyArrowResult<Self> {
+        let (batches, schema) = from_pandas_dataframe(py, df, schema.map(|s| s.into_inner()))?;
+        Ok(Self::try_new(batches, schema)?)
+    }
+
     #[classmethod]
     #[pyo3(signature = (mapping, *, schema=None, metadata=None))]
     fn from_pydict(

diff --git a/tests/core/test_table.py b/tests/core/test_table.py
@@ -95,3 +95,18 @@ def test_slice():
     sliced2 = table.slice(1, 2)
     assert sliced2.num_rows == 2
     assert sliced2.chunk_lengths == [1, 1]
+
+
+def test_from_pandas():
+    df = pd.DataFrame(
+        {
+            "int64": np.array([1, 2, 3, 4], dtype=np.int64),
+            "string": ["a", "b", "c", "d"],
+        }
+    )
+    for x in df.dtypes:
+        pass
+    df["a"].array.__array__().dtype
+    np.array(df["a"].array)
+    dtypes = df.dtypes.to_dict()
+    pass