>>> from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
>>> from pyspark.sql import SparkSession
>>> import pyarrow as pa
>>>
>>> class SimpleArrowDataSource(DataSource):
...     # Minimal example data source serving a fixed, in-memory dataset.
...     @classmethod
...     def name(cls):
...         # Short name used in spark.read.format("simplearrow").
...         return "simplearrow"
...
...     def schema(self):
...         # DDL-formatted schema string describing the rows produced.
...         return "key int, value string"
...
...     def reader(self, schema: str):
...         # NOTE(review): Spark passes the parsed schema here as a
...         # StructType, not a str — the annotation looks wrong; confirm
...         # against the DataSource.reader API.
...         return SimpleArrowDataSourceReader(schema, self.options)
>>>
>>> class SimpleArrowDataSourceReader(DataSourceReader):
...     def __init__(self, schema, options):
...         # NOTE(review): annotated str, but the value comes from
...         # DataSource.reader() and is likely a StructType — confirm.
...         self.schema: str = schema
...         self.options = options
...
...     def read(self, partition):
...         # Build one three-row pyarrow RecordBatch matching the declared
...         # "key int, value string" schema and yield it to Spark.
...         keys = pa.array([1, 2, 3], type=pa.int32())
...         values = pa.array(["one", "two", "three"], type=pa.string())
...         schema = pa.schema([("key", pa.int32()), ("value", pa.string())])
...         batch = pa.RecordBatch.from_arrays([keys, values], schema=schema)
...         yield batch
...
...     def partitions(self):
...         # Single partition; its value (0) is opaque to Spark and is
...         # handed back to read() unchanged.
...         return [InputPartition(0)]
>>>
>>> # Register the data source, then read it back by its short name.
>>> spark.dataSource.register(SimpleArrowDataSource)
>>> df = spark.read.format("simplearrow").load()
>>> df.show()
+---+-----+
|key|value|
+---+-----+
|  1|  one|
|  2|  two|
|  3|three|
+---+-----+
