python-streamz
diff --git a/‎examples/river_kmeans.ipynb‎
Lines changed: 134 additions & 0 deletions b/‎examples/river_kmeans.ipynb‎
Lines changed: 134 additions & 0 deletions
diff --git a/‎examples/river_kmeans.py‎
Lines changed: 70 additions & 0 deletions b/‎examples/river_kmeans.py‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎streamz/core.py‎
Lines changed: 0 additions & 83 deletions b/‎streamz/core.py‎
Lines changed: 0 additions & 83 deletions
diff --git a/‎streamz/river.py‎
Lines changed: 62 additions & 0 deletions b/‎streamz/river.py‎
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,134 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "accbccab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from streamz import Stream\n",
+    "import hvplot.streamz\n",
+    "from streamz.river import RiverTrain\n",
+    "from river import cluster\n",
+    "import holoviews as hv\n",
+    "from panel.pane.holoviews import HoloViews\n",
+    "hv.extension('bokeh')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a2ef27a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = cluster.KMeans(n_clusters=3, sigma=0.1, mu=0.5)\n",
+    "centres = [[random.random(), random.random()] for _ in range(3)]\n",
+    "\n",
+    "def gen(move_chance=0.05):\n",
+    "    centre = int(random.random() * 3)  # 3x faster than random.randint(0, 2)\n",
+    "    if random.random() < move_chance:\n",
+    "        centres[centre][0] += random.random() / 5 - 0.1\n",
+    "        centres[centre][1] += random.random() / 5 - 0.1\n",
+    "    value = {'x': random.random() / 20 + centres[centre][0],\n",
+    "             'y': random.random() / 20 + centres[centre][1]}\n",
+    "    return value\n",
+    "\n",
+    "\n",
+    "def get_clusters(model):\n",
+    "    # return [{\"x\": xcen, \"y\": ycen}, ...] for each centre\n",
+    "    data = [{'x': v['x'], 'y': v['y']} for k, v in model.centers.items()]\n",
+    "    return pd.DataFrame(data, index=range(3))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6451048",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "s = Stream.from_periodic(gen, 0.03)\n",
+    "km = RiverTrain(model, pass_model=True)\n",
+    "s.map(lambda x: (x,)).connect(km)  # learn takes a tuple of (x,[ y[, w]])\n",
+    "ex = pd.DataFrame({'x': [0.5], 'y': [0.5]})\n",
+    "ooo = s.map(lambda x: pd.DataFrame([x])).to_dataframe(example=ex)\n",
+    "out = km.map(get_clusters)\n",
+    "\n",
+    "# start things\n",
+    "s.emit(gen())  # set initial model\n",
+    "for i, (x, y) in enumerate(centres):\n",
+    "    model.centers[i]['x'] = x\n",
+    "    model.centers[i]['y'] = y\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1b4de451",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pout = out.to_dataframe(example=ex)\n",
+    "pl = (ooo.hvplot.scatter('x', 'y', color=\"blue\", backlog=50) *\n",
+    "      pout.hvplot.scatter('x', 'y', color=\"red\", backlog=3))\n",
+    "pl.opts(xlim=(-0.2, 1.2), ylim=(-0.2, 1.2), height=600, width=600)\n",
+    "pl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c24d2363",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "s.start()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "18cfd94e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "s.stop()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4537495c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
@@ -0,0 +1,70 @@
+import random
+
+import pandas as pd
+
+from streamz import Stream
+import hvplot.streamz
+from streamz.river import RiverTrain
+from river import cluster
+import holoviews as hv
+from panel.pane.holoviews import HoloViews
+hv.extension('bokeh')
+
+# River k-means model (three clusters) that the stream will train
+model = cluster.KMeans(n_clusters=3, sigma=0.1, mu=0.5)
+# "true" cluster centres as mutable [x, y] pairs; gen() shifts them in place
+centres = [[random.random(), random.random()] for _ in range(3)]
+# sample counter kept in a one-element list so gen() can increment it in place
+count = [0]
+
+def gen(move_chance=0.05):
+    """Produce one random sample close to a randomly chosen centre.
+
+    Also increments the module-level ``count[0]`` by one.
+
+    :param move_chance: float
+        Probability that the chosen centre is first shifted by a random
+        amount in [-0.1, 0.1) along each axis
+    :return: dict
+        Keys 'x' and 'y'; each is the centre coordinate plus a random
+        offset in [0, 0.05)
+    """
+    # int(random() * n) instead of random.randint(0, n - 1): noted by the
+    # original author as about 3x faster; n follows the number of centres
+    centre = int(random.random() * len(centres))
+    if random.random() < move_chance:
+        centres[centre][0] += random.random() / 5 - 0.1
+        centres[centre][1] += random.random() / 5 - 0.1
+    value = {'x': random.random() / 20 + centres[centre][0],
+             'y': random.random() / 20 + centres[centre][1]}
+    count[0] += 1
+    return value
+
+
+def get_clusters(model):
+    """Collect the model's current cluster centres into a DataFrame.
+
+    :param model: object with a ``centers`` mapping
+        Each value must be indexable by 'x' and 'y'
+    :return: pandas.DataFrame
+        One row per centre with columns 'x' and 'y', indexed from 0
+    """
+    # only the values are used; the index is sized from the data instead of
+    # assuming exactly three clusters
+    data = [{'x': v['x'], 'y': v['y']} for v in model.centers.values()]
+    return pd.DataFrame(data, index=range(len(data)))
+
+
+def main(viz=True):
+    """Run the streaming k-means demo.
+
+    Builds a periodic source from ``gen``, trains the module-level ``model``
+    on every sample and stops the source when finished.
+
+    :param viz: bool
+        If True, show a scatter plot of recent samples (blue) and the
+        model's centres (red). Otherwise sleep for 5 seconds and print the
+        event count, the true centres and the model's centres.
+    """
+    # setup pipes
+    cadence = 0.16 if viz else 0.01  # interval handed to from_periodic
+    s = Stream.from_periodic(gen, cadence)
+    km = RiverTrain(model, pass_model=True)
+    s.map(lambda x: (x,)).connect(km)  # learn takes a tuple of (x,[ y[, w]])
+    ex = pd.DataFrame({'x': [0.5], 'y': [0.5]})
+    ooo = s.map(lambda x: pd.DataFrame([x])).to_dataframe(example=ex)
+    out = km.map(get_clusters)
+
+    # start things
+    s.emit(gen())  # set initial model
+    # overwrite the model's centres with the true starting centres
+    for i, (x, y) in enumerate(centres):
+        model.centers[i]['x'] = x
+        model.centers[i]['y'] = y
+
+    print("starting")
+    s.start()
+
+    try:
+        if viz:
+            # plot
+            pout = out.to_dataframe(example=ex)
+            pl = (ooo.hvplot.scatter('x', 'y', color="blue", backlog=50) *
+                  pout.hvplot.scatter('x', 'y', color="red", backlog=3))
+            pl.opts(xlim=(-0.2, 1.2), ylim=(-0.2, 1.2), height=600, width=600)
+            pan = HoloViews(pl)
+            pan.show()
+        else:
+            import time
+            time.sleep(5)
+            print(count, "events")
+            print("Current centres", centres)
+            print("Output centres", [list(c.values()) for c in model.centers.values()])
+    finally:
+        # always halt the periodic source, even on an error or Ctrl-C
+        s.stop()
+
+# run the demo with plotting enabled when executed as a script
+if __name__ == "__main__":
+    main(viz=True)
@@ -1902,89 +1902,6 @@ def cb(self):
             yield self._emit(x, self.next_metadata)
 
 
-@Stream.register_api()
-class to_kafka(Stream):
-    """ Writes data in the stream to Kafka
-
-    This stream accepts a string or bytes object. Call ``flush`` to ensure all
-    messages are pushed. Responses from Kafka are pushed downstream.
-
-    Parameters
-    ----------
-    topic : string
-        The topic which to write
-    producer_config : dict
-        Settings to set up the stream, see
-        https://docs.confluent.io/current/clients/confluent-kafka-python/#configuration
-        https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
-        Examples:
-        bootstrap.servers: Connection string (host:port) to Kafka
-
-    Examples
-    --------
-    >>> from streamz import Stream
-    >>> ARGS = {'bootstrap.servers': 'localhost:9092'}
-    >>> source = Stream()
-    >>> kafka = source.map(lambda x: str(x)).to_kafka('test', ARGS)
-    <to_kafka>
-    >>> for i in range(10):
-    ...     source.emit(i)
-    >>> kafka.flush()
-    """
-    def __init__(self, upstream, topic, producer_config, **kwargs):
-        import confluent_kafka as ck
-
-        self.topic = topic
-        self.producer = ck.Producer(producer_config)
-
-        kwargs["ensure_io_loop"] = True
-        Stream.__init__(self, upstream, **kwargs)
-        self.stopped = False
-        self.polltime = 0.2
-        self.loop.add_callback(self.poll)
-        self.futures = []
-
-    @gen.coroutine
-    def poll(self):
-        while not self.stopped:
-            # executes callbacks for any delivered data, in this thread
-            # if no messages were sent, nothing happens
-            self.producer.poll(0)
-            yield gen.sleep(self.polltime)
-
-    def update(self, x, who=None, metadata=None):
-        future = gen.Future()
-        self.futures.append(future)
-
-        @gen.coroutine
-        def _():
-            while True:
-                try:
-                    # this runs asynchronously, in C-K's thread
-                    self.producer.produce(self.topic, x, callback=self.cb)
-                    return
-                except BufferError:
-                    yield gen.sleep(self.polltime)
-                except Exception as e:
-                    future.set_exception(e)
-                    return
-
-        self.loop.add_callback(_)
-        return future
-
-    @gen.coroutine
-    def cb(self, err, msg):
-        future = self.futures.pop(0)
-        if msg is not None and msg.value() is not None:
-            future.set_result(None)
-            yield self._emit(msg.value())
-        else:
-            future.set_exception(err or msg.error())
-
-    def flush(self, timeout=-1):
-        self.producer.flush(timeout)
-
-
 def sync(loop, func, *args, **kwargs):
     """
     Run coroutine in loop running in separate thread.
 
@@ -0,0 +1,62 @@
+from . import Stream
+
+
+# TODO: most river classes support batches, e.g., learn_many, more efficiently
+
+
+class RiverTransform(Stream):
+    """Pass data through one or more River transforms
+
+    Each incoming element is unpacked into the positional arguments of
+    ``model.transform_one`` and the result is emitted downstream.
+    """
+
+    def __init__(self, model, **kwargs):
+        """
+        :param model: river transformer or pipeline
+            Must provide a ``transform_one`` method
+        :param kwargs:
+            Passed on to ``Stream.__init__``
+        """
+        super().__init__(**kwargs)
+        self.model = model
+
+    def update(self, x, who=None, metadata=None):
+        """
+        :param x: tuple
+            Positional arguments for ``transform_one``; a lone sample has
+            to be wrapped as ``(x,)``
+        """
+        out = self.model.transform_one(*x)
+        # forward the metadata, as RiverTrain and RiverPredict do
+        self.emit(out, metadata=metadata)
+
+
+class RiverTrain(Stream):
+    """Train a River model on every incoming sample
+
+    Optionally emits either the current value of a metric or the model
+    itself after each sample.
+    """
+
+    def __init__(self, model, metric=None, pass_model=False, **kwargs):
+        """
+
+        If metric and pass_model are both defaults, this is effectively
+        a sink.
+
+        :param model: river model or pipeline
+        :param metric: river metric
+            If given, its value is emitted on every sample
+        :param pass_model: bool
+            If True, the (updated) model is emitted for each sample
+        :raises TypeError: if both ``metric`` and ``pass_model`` are given
+        """
+        super().__init__(**kwargs)
+        self.model = model
+        if pass_model and metric is not None:
+            raise TypeError(
+                "Specify at most one of `metric` and `pass_model`"
+            )
+        self.pass_model = pass_model
+        self.metric = metric
+
+    def update(self, x, who=None, metadata=None):
+        """
+        :param x: tuple
+            (x, [y[, w]]) for a single sample: features, optional target
+            and optional weight. The target is needed when a metric was
+            given.
+        """
+        self.model.learn_one(*x)
+        # test against None, matching the check in __init__, so a metric
+        # object that happens to be falsy is still used
+        if self.metric is not None:
+            # NOTE(review): the prediction is made after learning from this
+            # same sample; confirm that ordering is intended
+            yp = self.model.predict_one(x[0])
+            # the weight is the optional third element, so it only exists
+            # when the tuple has more than two entries
+            weights = x[2] if len(x) > 2 else 1.0
+            # update and read separately so this does not rely on update()
+            # returning the metric
+            self.metric.update(x[1], yp, weights)
+            self.emit(self.metric.get(), metadata=metadata)
+        if self.pass_model:
+            self.emit(self.model, metadata=metadata)
+
+
+class RiverPredict(Stream):
+    """Emit the model's prediction for every incoming sample"""
+
+    def __init__(self, model, **kwargs):
+        """
+        :param model: river model or pipeline
+            Must provide a ``predict_one`` method
+        :param kwargs:
+            Passed on to ``Stream.__init__``
+        """
+        super().__init__(**kwargs)
+        self.model = model
+
+    def update(self, x, who=None, metadata=None):
+        """
+        :param x:
+            A single sample, handed to ``predict_one`` as is
+        """
+        self.emit(self.model.predict_one(x), metadata=metadata)