kafka-commits mailing list archives

From guozh...@apache.org
Subject [15/15] kafka-site git commit: MINOR: Improve Streams Dev Guide
Date Wed, 20 Dec 2017 21:23:31 GMT
MINOR: Improve Streams Dev Guide

See related https://github.com/apache/kafka/pull/4252

Author: Joel Hamill <joel-hamill@users.noreply.github.com>

Reviewers: Derrick Or <derrickor@gmail.com>, Guozhang Wang <wangguoz@gmail.com>

Closes #112 from joel-hamill/add-update-dev-guide


Project: http://git-wip-us.apache.org/repos/asf/kafka-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/kafka-site/commit/12dbff55
Tree: http://git-wip-us.apache.org/repos/asf/kafka-site/tree/12dbff55
Diff: http://git-wip-us.apache.org/repos/asf/kafka-site/diff/12dbff55

Branch: refs/heads/asf-site
Commit: 12dbff55da8532d442446981aaff01ade3c1d4e5
Parents: 6752bf4
Author: Joel Hamill <joel-hamill@users.noreply.github.com>
Authored: Wed Dec 20 13:22:06 2017 -0800
Committer: Guozhang Wang <wangguoz@gmail.com>
Committed: Wed Dec 20 13:22:06 2017 -0800

----------------------------------------------------------------------
 .gitignore                                      |    1 +
 10/architecture.html                            |  160 +
 10/core-concepts.html                           |  213 ++
 10/developer-guide/app-reset-tool.html          |  173 +
 10/developer-guide/config-streams.html          |  717 ++++
 10/developer-guide/datatypes.html               |  223 ++
 10/developer-guide/dsl-api.html                 | 3208 ++++++++++++++++++
 10/developer-guide/index.html                   |  102 +
 10/developer-guide/interactive-queries.html     |  530 +++
 10/developer-guide/manage-topics.html           |  129 +
 10/developer-guide/memory-mgmt.html             |  241 ++
 10/developer-guide/processor-api.html           |  437 +++
 10/developer-guide/running-app.html             |  197 ++
 10/developer-guide/security.html                |  176 +
 10/developer-guide/write-streams.html           |  198 ++
 10/documentation/streams/architecture.html      |    2 +-
 10/documentation/streams/core-concepts.html     |    2 +-
 10/documentation/streams/developer-guide.html   |   19 -
 .../streams/developer-guide/app-reset-tool.html |   19 +
 .../streams/developer-guide/config-streams.html |   19 +
 .../streams/developer-guide/datatypes.html      |   19 +
 .../streams/developer-guide/dsl-api.html        |   19 +
 .../streams/developer-guide/index.html          |   19 +
 .../developer-guide/interactive-queries.html    |   19 +
 .../streams/developer-guide/manage-topics.html  |   19 +
 .../streams/developer-guide/memory-mgmt.html    |   19 +
 .../streams/developer-guide/processor-api.html  |   19 +
 .../streams/developer-guide/running-app.html    |   19 +
 .../streams/developer-guide/security.html       |   19 +
 .../streams/developer-guide/write-streams.html  |   19 +
 10/documentation/streams/index.html             |    2 +-
 10/documentation/streams/quickstart.html        |    2 +-
 10/documentation/streams/tutorial.html          |    2 +-
 10/documentation/streams/upgrade-guide.html     |    2 +-
 10/images/streams-elastic-scaling-1.png         |  Bin 0 -> 88673 bytes
 10/images/streams-elastic-scaling-2.png         |  Bin 0 -> 91141 bytes
 10/images/streams-elastic-scaling-3.png         |  Bin 0 -> 88604 bytes
 10/images/streams-session-windows-01.png        |  Bin 0 -> 49003 bytes
 10/images/streams-session-windows-02.png        |  Bin 0 -> 55956 bytes
 10/images/streams-time-windows-hopping.png      |  Bin 0 -> 110392 bytes
 10/images/streams-time-windows-tumbling.png     |  Bin 0 -> 63888 bytes
 10/index.html                                   |  367 ++
 10/quickstart.html                              |  459 ++-
 10/streams/architecture.html                    |    4 +-
 10/streams/core-concepts.html                   |   33 +-
 10/streams/developer-guide.html                 | 3026 -----------------
 10/streams/developer-guide/app-reset-tool.html  |  173 +
 10/streams/developer-guide/config-streams.html  |  717 ++++
 10/streams/developer-guide/datatypes.html       |  223 ++
 10/streams/developer-guide/dsl-api.html         | 3208 ++++++++++++++++++
 10/streams/developer-guide/index.html           |  102 +
 .../developer-guide/interactive-queries.html    |  530 +++
 10/streams/developer-guide/manage-topics.html   |  129 +
 10/streams/developer-guide/memory-mgmt.html     |  241 ++
 10/streams/developer-guide/processor-api.html   |  437 +++
 10/streams/developer-guide/running-app.html     |  197 ++
 10/streams/developer-guide/security.html        |  176 +
 10/streams/developer-guide/write-streams.html   |  198 ++
 10/streams/index.html                           |  331 +-
 10/streams/quickstart.html                      |   39 +-
 10/streams/tutorial.html                        |   33 +-
 10/streams/upgrade-guide.html                   |    2 +-
 10/tutorial.html                                |  663 ++++
 10/upgrade-guide.html                           |  390 +++
 documentation/streams/developer-guide.html      |    2 -
 .../streams/developer-guide/app-reset-tool.html |    2 +
 .../streams/developer-guide/config-streams.html |    2 +
 .../streams/developer-guide/datatypes.html      |    2 +
 .../streams/developer-guide/dsl-api.html        |    2 +
 .../streams/developer-guide/index.html          |    2 +
 .../developer-guide/interactive-queries.html    |    2 +
 .../streams/developer-guide/manage-topics.html  |    2 +
 .../streams/developer-guide/memory-mgmt.html    |    2 +
 .../streams/developer-guide/processor-api.html  |    2 +
 .../streams/developer-guide/running-app.html    |    2 +
 .../streams/developer-guide/security.html       |    2 +
 .../streams/developer-guide/write-streams.html  |    2 +
 77 files changed, 15210 insertions(+), 3458 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kafka-site/blob/12dbff55/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e43b0f9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.DS_Store

http://git-wip-us.apache.org/repos/asf/kafka-site/blob/12dbff55/10/architecture.html
----------------------------------------------------------------------
diff --git a/10/architecture.html b/10/architecture.html
new file mode 100644
index 0000000..efc01bd
--- /dev/null
+++ b/10/architecture.html
@@ -0,0 +1,160 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script><!--#include virtual="../js/templateData.js" --></script>
+
+<script id="content-template" type="text/x-handlebars-template">
+    <h1>Architecture</h1>
+
+    Kafka Streams simplifies application development by building on the Kafka producer and consumer libraries and leveraging the native capabilities of
+    Kafka to offer data parallelism, distributed coordination, fault tolerance, and operational simplicity. In this section, we describe how Kafka Streams works underneath the covers.
+
+    <p>
+        The picture below shows the anatomy of an application that uses the Kafka Streams library. Let's walk through some details.
+    </p>
+    <img class="centered" src="/{{version}}/images/streams-architecture-overview.jpg" style="width:750px">
+
+    <h3><a id="streams_architecture_tasks" href="#streams_architecture_tasks">Stream Partitions and Tasks</a></h3>
+
+    <p>
+        The messaging layer of Kafka partitions data for storing and transporting it. Kafka Streams partitions data for processing it.
+        In both cases, this partitioning is what enables data locality, elasticity, scalability, high performance, and fault tolerance.
+        Kafka Streams uses the concepts of <b>partitions</b> and <b>tasks</b> as logical units of its parallelism model based on Kafka topic partitions.
+        There are close links between Kafka Streams and Kafka in the context of parallelism:
+    </p>
+
+    <ul>
+        <li>Each <b>stream partition</b> is a totally ordered sequence of data records and maps to a Kafka <b>topic partition</b>.</li>
+        <li>A <b>data record</b> in the stream maps to a Kafka <b>message</b> from that topic.</li>
+        <li>The <b>keys</b> of data records determine the partitioning of data in both Kafka and Kafka Streams, i.e., how data is routed to specific partitions within topics.</li>
+    </ul>
+
+    <p>
+        An application's processor topology is scaled by breaking it into multiple tasks.
+        More specifically, Kafka Streams creates a fixed number of tasks based on the input stream partitions for the application,
+        with each task assigned a list of partitions from the input streams (i.e., Kafka topics). The assignment of partitions to tasks
+        never changes, so each task is a fixed unit of parallelism of the application. Each task can then instantiate its own processor topology
+        based on its assigned partitions; it also maintains a buffer for each of these partitions and processes messages one at a time from
+        these record buffers. As a result, stream tasks can be processed independently and in parallel without manual intervention.
+    </p>
+
+    <p>
+        It is important to understand that Kafka Streams is not a resource manager, but a library that "runs" anywhere its stream processing application runs.
+        Multiple instances of the application can be executed either on the same machine or spread across multiple machines, and the library automatically distributes tasks
+        among those running application instances. The assignment of partitions to tasks never changes; if an application instance fails, all its assigned
+        tasks will be automatically restarted on other instances and continue to consume from the same stream partitions.
+    </p>
+
+    <p>
+        The following diagram shows two tasks each assigned with one partition of the input streams.
+    </p>
+    <img class="centered" src="/{{version}}/images/streams-architecture-tasks.jpg" style="width:400px">
+    <br>
+
+    <h3><a id="streams_architecture_threads" href="#streams_architecture_threads">Threading Model</a></h3>
+
+    <p>
+        Kafka Streams allows the user to configure the number of <b>threads</b> that the library can use to parallelize processing within an application instance.
+        Each thread can execute one or more tasks with their processor topologies independently. For example, the following diagram shows one stream thread running two stream tasks.
+    </p>
+    <img class="centered" src="/{{version}}/images/streams-architecture-threads.jpg" style="width:400px">
+
+    <p>
+        Starting more stream threads or more instances of the application merely amounts to replicating the topology and having it process a different subset of Kafka partitions, effectively parallelizing processing.
+        It is worth noting that there is no shared state amongst the threads, so no inter-thread coordination is necessary. This makes it very simple to run topologies in parallel across the application instances and threads.
+        The assignment of Kafka topic partitions amongst the various stream threads is transparently handled by Kafka Streams leveraging <a href="https://cwiki.apache.org/confluence/display/KAFKA/Kafka+Client-side+Assignment+Proposal">Kafka's coordination</a> functionality.
+    </p>
+
+    <p>
+        As we described above, scaling your stream processing application with Kafka Streams is easy: you merely need to start additional instances of your application,
+        and Kafka Streams takes care of distributing partitions amongst tasks that run in the application instances. You can start as many threads of the application
+        as there are input Kafka topic partitions so that, across all running instances of an application, every thread (or rather, the tasks it runs) has at least one input partition to process.
+    </p>
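+
+    <p>
+        As an illustrative sketch (the thread count below is just an example value), the number of stream threads per application instance is a single configuration setting:
+    </p>
+    <pre class="brush: java;">
+Properties props = new Properties();
+// Run four stream threads in this instance; Kafka Streams balances the
+// application's tasks across all threads of all running instances.
+props.put(StreamsConfig.NUM_STREAM_THREADS_CONFIG, 4);
+    </pre>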
+    <br>
+
+    <h3><a id="streams_architecture_state" href="#streams_architecture_state">Local State Stores</a></h3>
+
+    <p>
+        Kafka Streams provides so-called <b>state stores</b>, which can be used by stream processing applications to store and query data,
+        an important capability when implementing stateful operations. The <a href="/{{version}}/documentation/streams/developer-guide#streams_dsl">Kafka Streams DSL</a>, for example, automatically creates
+        and manages such state stores when you call stateful operators such as <code>join()</code> or <code>aggregate()</code>, or when you window a stream.
+    </p>
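+
+    <p>
+        For example, in the following minimal DSL sketch (topic and store names are hypothetical), calling <code>count()</code> causes Kafka Streams to create and manage such a state store automatically:
+    </p>
+    <pre class="brush: java;">
+StreamsBuilder builder = new StreamsBuilder();
+KStream&lt;String, String&gt; events = builder.stream("events-topic");
+// count() is stateful: the DSL backs it with a local state store (here named
+// "event-counts-store") plus a changelog topic for fault tolerance.
+KTable&lt;String, Long&gt; counts = events
+    .groupByKey()
+    .count(Materialized.as("event-counts-store"));
+    </pre>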
+
+    <p>
+        Every stream task in a Kafka Streams application may embed one or more local state stores that can be accessed via APIs to store and query data required for processing.
+        Kafka Streams offers fault-tolerance and automatic recovery for such local state stores.
+    </p>
+
+    <p>
+        The following diagram shows two stream tasks with their dedicated local state stores.
+    </p>
+    <img class="centered" src="/{{version}}/images/streams-architecture-states.jpg" style="width:400px">
+    <br>
+
+    <h3><a id="streams_architecture_recovery" href="#streams_architecture_recovery">Fault Tolerance</a></h3>
+
+    <p>
+        Kafka Streams builds on fault-tolerance capabilities integrated natively within Kafka. Kafka partitions are highly available and replicated, so when stream data is persisted to Kafka it is available
+        even if the application fails and needs to re-process it. Tasks in Kafka Streams leverage the fault-tolerance capability
+        offered by the Kafka consumer client to handle failures.
+        If a task runs on a machine that fails, Kafka Streams automatically restarts the task in one of the remaining running instances of the application.
+    </p>
+
+    <p>
+        In addition, Kafka Streams makes sure that the local state stores are robust to failures, too. For each state store, it maintains a replicated changelog Kafka topic in which it tracks any state updates.
+        These changelog topics are partitioned as well so that each local state store instance, and hence the task accessing the store, has its own dedicated changelog topic partition.
+        <a href="/{{version}}/documentation/#compaction">Log compaction</a> is enabled on the changelog topics so that old data can be purged safely to prevent the topics from growing indefinitely.
+        If tasks run on a machine that fails and are restarted on another machine, Kafka Streams guarantees to restore their associated state stores to the content before the failure by
+        replaying the corresponding changelog topics prior to resuming the processing on the newly started tasks. As a result, failure handling is completely transparent to the end user.
+    </p>
+
+    <p>
+        Note that the cost of task (re)initialization typically depends primarily on the time for restoring the state by replaying the state stores' associated changelog topics.
+        To minimize this restoration time, users can configure their applications to have <b>standby replicas</b> of local states (i.e. fully replicated copies of the state).
+        When a task migration happens, Kafka Streams then attempts to assign a task to an application instance where such a standby replica already exists in order to minimize
+        the task (re)initialization cost. See <code>num.standby.replicas</code> in the <a href="/{{version}}/documentation/#streamsconfigs"><b>Kafka Streams Configs</b></a> section.
+    </p>
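+
+    <p>
+        As a sketch (the application ID and bootstrap servers below are placeholders), enabling one standby replica is a single configuration setting:
+    </p>
+    <pre class="brush: java;">
+Properties props = new Properties();
+props.put(StreamsConfig.APPLICATION_ID_CONFIG, "my-streams-app");    // placeholder
+props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // placeholder
+// Maintain one warm, continuously updated copy of each task's state on another
+// instance, so a migrated task can resume with minimal restoration time.
+props.put(StreamsConfig.NUM_STANDBY_REPLICAS_CONFIG, 1);
+    </pre>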
+
+    <div class="pagination">
+        <a href="/{{version}}/documentation/streams/core-concepts" class="pagination__btn pagination__btn__prev">Previous</a>
+        <a href="/{{version}}/documentation/streams/upgrade-guide" class="pagination__btn pagination__btn__next">Next</a>
+    </div>
+</script>
+
+<!--#include virtual="../../includes/_header.htm" -->
+<!--#include virtual="../../includes/_top.htm" -->
+<div class="content documentation documentation--current">
+    <!--#include virtual="../../includes/_nav.htm" -->
+    <div class="right">
+        <!--#include virtual="../../includes/_docs_banner.htm" -->
+        <ul class="breadcrumbs">
+            <li><a href="/documentation">Documentation</a></li>
+            <li><a href="/documentation/streams">Kafka Streams</a></li>
+        </ul>
+        <div class="p-content"></div>
+    </div>
+</div>
+<!--#include virtual="../../includes/_footer.htm" -->
+<script>
+$(function() {
+  // Show selected style on nav item
+  $('.b-nav__streams').addClass('selected');
+
+  // Display docs subnav items
+  $('.b-nav__docs').parent().toggleClass('nav__item__with__subs--expanded');
+});
+</script>

http://git-wip-us.apache.org/repos/asf/kafka-site/blob/12dbff55/10/core-concepts.html
----------------------------------------------------------------------
diff --git a/10/core-concepts.html b/10/core-concepts.html
new file mode 100644
index 0000000..1675c1f
--- /dev/null
+++ b/10/core-concepts.html
@@ -0,0 +1,213 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script><!--#include virtual="../js/templateData.js" --></script>
+
+<script id="content-template" type="text/x-handlebars-template">
+    <h1>Core Concepts</h1>
+    <div class="sub-nav-sticky">
+      <div class="sticky-top">
+        <div style="height:35px">
+          <a href="/{{version}}/documentation/streams/">Introduction</a>
+          <a href="/{{version}}/documentation/streams/developer-guide/">Developer Guide</a>
+          <a class="active-menu-item" href="/{{version}}/documentation/streams/core-concepts">Concepts</a>
+          <a href="/{{version}}/documentation/streams/quickstart">Run Demo App</a>
+          <a href="/{{version}}/documentation/streams/tutorial">Tutorial: Write App</a>
+        </div>
+      </div>
+  </div> 
+    <p>
+        Kafka Streams is a client library for processing and analyzing data stored in Kafka.
+        It builds upon important stream processing concepts such as properly distinguishing between event time and processing time, windowing support, and simple yet efficient management and real-time querying of application state.
+    </p>
+    <p>
+        Kafka Streams has a <b>low barrier to entry</b>: You can quickly write and run a small-scale proof-of-concept on a single machine, and you only need to run additional instances of your application on multiple machines to scale up to high-volume production workloads.
+        Kafka Streams transparently handles the load balancing of multiple instances of the same application by leveraging Kafka's parallelism model.
+    </p>
+    <p>
+        Some highlights of Kafka Streams:
+    </p>
+
+    <ul>
+        <li>Designed as a <b>simple and lightweight client library</b>, which can be easily embedded in any Java application and integrated with any existing packaging, deployment and operational tools that users have for their streaming applications.</li>
+        <li>Has <b>no external dependencies on systems other than Apache Kafka itself</b> as the internal messaging layer; notably, it uses Kafka's partitioning model to horizontally scale processing while maintaining strong ordering guarantees.</li>
+        <li>Supports <b>fault-tolerant local state</b>, which enables very fast and efficient stateful operations like windowed joins and aggregations.</li>
+        <li>Supports <b>exactly-once</b> processing semantics to guarantee that each record will be processed once and only once even when there is a failure on either Streams clients or Kafka brokers in the middle of processing.</li>
+        <li>Employs <b>one-record-at-a-time processing</b> to achieve millisecond processing latency, and supports <b>event-time based windowing operations</b> with late arrival of records.</li>
+        <li>Offers necessary stream processing primitives, along with a <b>high-level Streams DSL</b> and a <b>low-level Processor API</b>.</li>
+
+    </ul>
+
+    <p>
+        We first summarize the key concepts of Kafka Streams.
+    </p>
+
+    <h3><a id="streams_topology" href="#streams_topology">Stream Processing Topology</a></h3>
+
+    <ul>
+        <li>A <b>stream</b> is the most important abstraction provided by Kafka Streams: it represents an unbounded, continuously updating data set. A stream is an ordered, replayable, and fault-tolerant sequence of immutable data records, where a <b>data record</b> is defined as a key-value pair.</li>
+        <li>A <b>stream processing application</b> is any program that makes use of the Kafka Streams library. It defines its computational logic through one or more <b>processor topologies</b>, where a processor topology is a graph of stream processors (nodes) that are connected by streams (edges).</li>
+        <li>A <b>stream processor</b> is a node in the processor topology; it represents a processing step to transform data in streams by receiving one input record at a time from its upstream processors in the topology, applying its operation to it, and may subsequently produce one or more output records to its downstream processors. </li>
+    </ul>
+
+    There are two special processors in the topology:
+
+    <ul>
+        <li><b>Source Processor</b>: A source processor is a special type of stream processor that does not have any upstream processors. It produces an input stream to its topology from one or multiple Kafka topics by consuming records from these topics and forwarding them to its down-stream processors.</li>
+        <li><b>Sink Processor</b>: A sink processor is a special type of stream processor that does not have down-stream processors. It sends any received records from its up-stream processors to a specified Kafka topic.</li>
+    </ul>
+
+    Note that in normal processor nodes, other remote systems can also be accessed while processing the current record. Therefore, the processed results can either be streamed back into Kafka or written to an external system.
+
+    <img class="centered" src="/{{version}}/images/streams-architecture-topology.jpg" style="width:400px">
+
+    <p>
+        Kafka Streams offers two ways to define the stream processing topology: the <a href="/{{version}}/documentation/streams/developer-guide#streams_dsl"><b>Kafka Streams DSL</b></a> provides
+        the most common data transformation operations such as <code>map</code>, <code>filter</code>, <code>join</code> and <code>aggregations</code> out of the box; the lower-level <a href="/{{version}}/documentation/streams/developer-guide#streams_processor"><b>Processor API</b></a> allows
+        developers to define and connect custom processors as well as to interact with <a href="#streams_state">state stores</a>.
+    </p>
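+
+    <p>
+        As a small illustrative sketch of the DSL (topic names are hypothetical), the following topology wires a source processor, two stateless transformations, and a sink processor:
+    </p>
+    <pre class="brush: java;">
+StreamsBuilder builder = new StreamsBuilder();
+builder.stream("text-lines-topic")                         // source processor
+       .filter((key, value) -&gt; value != null)              // drop records without a value
+       .mapValues(value -&gt; value.toString().toUpperCase()) // transform each value
+       .to("uppercased-lines-topic");                      // sink processor
+    </pre>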
+
+    <p>
+        A processor topology is merely a logical abstraction for your stream processing code.
+        At runtime, the logical topology is instantiated and replicated inside the application for parallel processing (see <a href="#streams_architecture_tasks"><b>Stream Partitions and Tasks</b></a> for details).
+    </p>
+
+    <h3><a id="streams_time" href="#streams_time">Time</a></h3>
+
+    <p>
+        A critical aspect in stream processing is the notion of <b>time</b>, and how it is modeled and integrated.
+        For example, some operations such as <b>windowing</b> are defined based on time boundaries.
+    </p>
+    <p>
+        Common notions of time in streams are:
+    </p>
+
+    <ul>
+        <li><b>Event time</b> - The point in time when an event or data record occurred, i.e. was originally created "at the source". <b>Example:</b> If the event is a geo-location change reported by a GPS sensor in a car, then the associated event-time would be the time when the GPS sensor captured the location change.</li>
+        <li><b>Processing time</b> - The point in time when the event or data record happens to be processed by the stream processing application, i.e. when the record is being consumed. The processing time may be milliseconds, hours, or days etc. later than the original event time. <b>Example:</b> Imagine an analytics application that reads and processes the geo-location data reported from car sensors to present it to a fleet management dashboard. Here, processing-time in the analytics application might be milliseconds or seconds (e.g. for real-time pipelines based on Apache Kafka and Kafka Streams) or hours (e.g. for batch pipelines based on Apache Hadoop or Apache Spark) after event-time.</li>
+        <li><b>Ingestion time</b> - The point in time when an event or data record is stored in a topic partition by a Kafka broker. The difference to event time is that this ingestion timestamp is generated when the record is appended to the target topic by the Kafka broker, not when the record is created "at the source". The difference to processing time is that processing time is when the stream processing application processes the record. <b>For example,</b> if a record is never processed, there is no notion of processing time for it, but it still has an ingestion time.</li>
+    </ul>
+    <p>
+        The choice between event-time and ingestion-time is actually made through the configuration of Kafka (not Kafka Streams): From Kafka 0.10.x onwards, timestamps are automatically embedded into Kafka messages. Depending on Kafka's configuration, these timestamps represent event-time or ingestion-time. The respective Kafka configuration setting can be specified on the broker level or per topic. The default timestamp extractor in Kafka Streams will retrieve these embedded timestamps as-is. Hence, the effective time semantics of your application depend on the effective Kafka configuration for these embedded timestamps.
+    </p>
+    <p>
+        Kafka Streams assigns a <b>timestamp</b> to every data record via the <code>TimestampExtractor</code> interface.
+        These per-record timestamps describe the progress of a stream with regard to time and are leveraged by time-dependent operations such as window operations.
+        As a result, this time only advances when a new record arrives at the processor.
+        We call this data-driven time the <b>stream time</b> of the application, to differentiate it from the <b>wall-clock time</b> at which the application is actually executing.
+        Concrete implementations of the <code>TimestampExtractor</code> interface then give the stream time its specific semantics.
+        For example, retrieving or computing timestamps from the actual contents of data records, such as an embedded timestamp field, provides event-time semantics,
+        whereas returning the current wall-clock time yields processing-time semantics.
+        Developers can thus enforce different notions of time depending on their business needs.
+    </p>
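+
+    <p>
+        As an illustrative sketch (the <code>Order</code> payload type and its <code>getCreatedAtMs()</code> accessor are hypothetical), an event-time extractor might read a timestamp embedded in the record value:
+    </p>
+    <pre class="brush: java;">
+import org.apache.kafka.clients.consumer.ConsumerRecord;
+import org.apache.kafka.streams.processor.TimestampExtractor;
+
+public class OrderTimestampExtractor implements TimestampExtractor {
+    @Override
+    public long extract(ConsumerRecord&lt;Object, Object&gt; record, long previousTimestamp) {
+        if (record.value() instanceof Order) {
+            // Event-time semantics: use the timestamp embedded in the payload.
+            return ((Order) record.value()).getCreatedAtMs();
+        }
+        // Fall back to the timestamp embedded in the Kafka message itself.
+        return record.timestamp();
+    }
+}
+    </pre>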
+
+    <p>
+        Finally, whenever a Kafka Streams application writes records to Kafka, then it will also assign timestamps to these new records. The way the timestamps are assigned depends on the context:
+    </p>
+
+    <ul>
+        <li> When new output records are generated via processing some input record, for example, <code>context.forward()</code> triggered in the <code>process()</code> function call, output record timestamps are inherited from input record timestamps directly.</li>
+        <li> When new output records are generated via periodic functions such as <code>Punctuator#punctuate()</code>, the output record timestamp is defined as the current internal time (obtained through <code>context.timestamp()</code>) of the stream task.</li>
+        <li> For aggregations, the timestamp of a resulting aggregate update record will be that of the latest arrived input record that triggered the update.</li>
+    </ul>
+
+    <h3><a id="streams_state" href="#streams_state">States</a></h3>
+
+    <p>
+        Some stream processing applications don't require state, which means the processing of a message is independent from
+        the processing of all other messages.
+        However, being able to maintain state opens up many possibilities for sophisticated stream processing applications: you
+        can join input streams, or group and aggregate data records. Many such stateful operators are provided by the <a href="/{{version}}/documentation/streams/developer-guide#streams_dsl"><b>Kafka Streams DSL</b></a>.
+    </p>
+    <p>
+        Kafka Streams provides so-called <b>state stores</b>, which can be used by stream processing applications to store and query data.
+        This is an important capability when implementing stateful operations.
+        Every task in Kafka Streams embeds one or more state stores that can be accessed via APIs to store and query data required for processing.
+        These state stores can either be a persistent key-value store, an in-memory hashmap, or another convenient data structure.
+        Kafka Streams offers fault-tolerance and automatic recovery for local state stores.
+    </p>
+    <p>
+        Kafka Streams allows direct read-only queries of the state stores by methods, threads, processes or applications external to the stream processing application that created the state stores. This is provided through a feature called <b>Interactive Queries</b>. All stores are named and Interactive Queries exposes only the read operations of the underlying implementation.
+    </p>
+    <br>
+
+    <h2><a id="streams_processing_guarantee" href="#streams_processing_guarantee">Processing Guarantees</a></h2>
+
+    <p>
+        In stream processing, one of the most frequently asked questions is "does my stream processing system guarantee that each record is processed once and only once, even if some failures are encountered in the middle of processing?"
+        Failing to guarantee exactly-once stream processing is a deal-breaker for many applications that cannot tolerate any data loss or data duplicates; in that case, a batch-oriented framework is usually used in addition
+        to the stream processing pipeline, an approach known as the <a href="http://lambda-architecture.net/">Lambda Architecture</a>.
+        Prior to 0.11.0.0, Kafka provided only at-least-once delivery guarantees, and hence any stream processing system that leveraged it as the backend storage could not guarantee end-to-end exactly-once semantics.
+        In fact, even for stream processing systems that claim to support exactly-once processing, as long as they are reading from / writing to Kafka as the source / sink, their applications cannot actually guarantee that
+        no duplicates will be generated throughout the pipeline.
+
+        Since the 0.11.0.0 release, Kafka has added support to allow its producers to send messages to different topic partitions in a <a href="https://kafka.apache.org/documentation/#semantics">transactional and idempotent manner</a>,
+        and Kafka Streams has hence added the end-to-end exactly-once processing semantics by leveraging these features.
+        More specifically, it guarantees that for any record read from the source Kafka topics, its processing results will be reflected exactly once in the output Kafka topic as well as in the state stores for stateful operations.
+        Note that the key difference between Kafka Streams' end-to-end exactly-once guarantee and other stream processing frameworks' claimed guarantees is that Kafka Streams tightly integrates with the underlying Kafka storage system and ensures that
+        commits on the input topic offsets, updates on the state stores, and writes to the output topics are completed atomically, instead of treating Kafka as an external system that may have side-effects.
+        For more details on how this is done inside Kafka Streams, see <a href="https://cwiki.apache.org/confluence/display/KAFKA/KIP-129%3A+Streams+Exactly-Once+Semantics">KIP-129</a>.
+
+        In order to achieve exactly-once semantics when running Kafka Streams applications, users can simply set the <code>processing.guarantee</code> config value to <b>exactly_once</b> (default value is <b>at_least_once</b>).
+        More details can be found in the <a href="/{{version}}/documentation#streamsconfigs"><b>Kafka Streams Configs</b></a> section.
+    </p>
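+
+    <p>
+        For example (a minimal sketch; only the relevant setting is shown):
+    </p>
+    <pre class="brush: java;">
+Properties props = new Properties();
+// Upgrade from the default at-least-once to end-to-end exactly-once processing;
+// this requires brokers running version 0.11.0.0 or newer.
+props.put(StreamsConfig.PROCESSING_GUARANTEE_CONFIG, StreamsConfig.EXACTLY_ONCE);
+    </pre>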
+
+    <div class="pagination">
+        <a href="/{{version}}/documentation/streams/developer-guide" class="pagination__btn pagination__btn__prev">Previous</a>
+        <a href="/{{version}}/documentation/streams/architecture" class="pagination__btn pagination__btn__next">Next</a>
+    </div>
+</script>
+
+<!--#include virtual="../../includes/_header.htm" -->
+<!--#include virtual="../../includes/_top.htm" -->
+<div class="content documentation documentation--current">
+    <!--#include virtual="../../includes/_nav.htm" -->
+    <div class="right">
+        <!--#include virtual="../../includes/_docs_banner.htm" -->
+        <ul class="breadcrumbs">
+            <li><a href="/documentation">Documentation</a></li>
+            <li><a href="/documentation/streams">Kafka Streams</a></li>
+        </ul>
+        <div class="p-content"></div>
+    </div>
+</div>
+<!--#include virtual="../../includes/_footer.htm" -->
+<script>
+$(function() {
+  // Show selected style on nav item
+  $('.b-nav__streams').addClass('selected');
+
+
+     //sticky secondary nav
+          var $navbar = $(".sub-nav-sticky"),
+               y_pos = $navbar.offset().top,
+               height = $navbar.height();
+       
+           $(window).scroll(function() {
+               var scrollTop = $(window).scrollTop();
+           
+               if (scrollTop > y_pos - height) {
+                   $navbar.addClass("navbar-fixed")
+               } else if (scrollTop <= y_pos) {
+                   $navbar.removeClass("navbar-fixed")
+               }
+           });
+
+  // Display docs subnav items
+  $('.b-nav__docs').parent().toggleClass('nav__item__with__subs--expanded');
+});
+</script>

http://git-wip-us.apache.org/repos/asf/kafka-site/blob/12dbff55/10/developer-guide/app-reset-tool.html
----------------------------------------------------------------------
diff --git a/10/developer-guide/app-reset-tool.html b/10/developer-guide/app-reset-tool.html
new file mode 100644
index 0000000..b8a590c
--- /dev/null
+++ b/10/developer-guide/app-reset-tool.html
@@ -0,0 +1,173 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script><!--#include virtual="../../js/templateData.js" --></script>
+
+<script id="content-template" type="text/x-handlebars-template">
+  <!-- h1>Developer Guide for Kafka Streams API</h1 -->
+  <div class="sub-nav-sticky">
+    <div class="sticky-top">
+      <!-- div style="height:35px">
+        <a href="/{{version}}/documentation/streams/">Introduction</a>
+        <a class="active-menu-item" href="/{{version}}/documentation/streams/developer-guide">Developer Guide</a>
+        <a href="/{{version}}/documentation/streams/core-concepts">Concepts</a>
+        <a href="/{{version}}/documentation/streams/quickstart">Run Demo App</a>
+        <a href="/{{version}}/documentation/streams/tutorial">Tutorial: Write App</a>
+      </div -->
+    </div>
+  </div>
+
+    <div class="section" id="application-reset-tool">
+        <span id="streams-developer-guide-app-reset"></span><h1>Application Reset Tool<a class="headerlink" href="#application-reset-tool" title="Permalink to this headline"></a></h1>
+        <p>You can reset an application and force it to reprocess its data from scratch by using the application reset tool.
+            This can be useful for development and testing, or when fixing bugs.</p>
+        <p>The application reset tool handles the Kafka Streams <a class="reference internal" href="manage-topics.html#streams-developer-guide-topics-user"><span class="std std-ref">user topics</span></a> (input,
+            output, and intermediate topics) and <a class="reference internal" href="manage-topics.html#streams-developer-guide-topics-internal"><span class="std std-ref">internal topics</span></a> differently
+            when resetting the application.</p>
+        <p>Here&#8217;s what the application reset tool does for each topic type:</p>
+        <ul class="simple">
+            <li>Input topics: Reset to the beginning of the topic. This means that it sets the application&#8217;s committed consumer offsets for all partitions to each partition&#8217;s <code class="docutils literal"><span class="pre">earliest</span></code> offset (for consumer group <code class="docutils literal"><span class="pre">application.id</span></code>).</li>
+            <li>Intermediate topics: Skip to the end of the topic, i.e., set the application&#8217;s committed consumer offsets for all partitions to each partition&#8217;s <code class="docutils literal"><span class="pre">logSize</span></code> (for consumer group <code class="docutils literal"><span class="pre">application.id</span></code>).</li>
+            <li>Internal topics: Delete the internal topic (this automatically deletes any committed offsets).</li>
+        </ul>
+        <p>The application reset tool does not:</p>
+        <ul class="simple">
+            <li>Reset output topics of an application. If any output (or intermediate) topics are consumed by downstream
+                applications, it is your responsibility to adjust those downstream applications as appropriate when you reset the
+                upstream application.</li>
+            <li>Reset the local environment of your application instances.  It is your responsibility to delete the local
+                state on any machine on which an application instance was run.  See the instructions in section
+                <a class="reference internal" href="#streams-developer-guide-reset-local-environment"><span class="std std-ref">Step 2: Reset the local environments of your application instances</span></a> on how to do this.</li>
+        </ul>
+        <dl class="docutils">
+            <dt>Prerequisites</dt>
+            <dd><ul class="first last">
+                <li><p class="first">All instances of your application must be stopped. Otherwise, the application may enter an invalid state, crash, or produce incorrect results. You can verify whether the consumer group with ID <code class="docutils literal"><span class="pre">application.id</span></code> is still active by using <code class="docutils literal"><span class="pre">bin/kafka-consumer-groups</span></code>.</p>
+                </li>
+                <li><p class="first">Use this tool with care and double-check its parameters: If you provide wrong parameter values (e.g., typos in <code class="docutils literal"><span class="pre">application.id</span></code>) or specify parameters inconsistently (e.g., specify the wrong input topics for the application), this tool might invalidate the application&#8217;s state or even impact other applications, consumer groups, or your Kafka topics.</p>
+                </li>
+                <li><p class="first">You should manually delete and re-create any intermediate topics before running the application reset tool. This will free up disk space in Kafka brokers.</p>
+                </li>
+                <li><p class="first">You should delete and recreate intermediate topics before running the application reset tool, unless the following applies:</p>
+                    <blockquote>
+                        <div><ul class="simple">
+                            <li>You have external downstream consumers for the application&#8217;s intermediate topics.</li>
+                            <li>You are in a development environment where manually deleting and re-creating intermediate topics is unnecessary.</li>
+                        </ul>
+                        </div></blockquote>
+                </li>
+            </ul>
+            </dd>
+        </dl>
+        <div class="section" id="step-1-run-the-application-reset-tool">
+            <h2>Step 1: Run the application reset tool<a class="headerlink" href="#step-1-run-the-application-reset-tool" title="Permalink to this headline"></a></h2>
+            <p>Invoke the application reset tool from the command line:</p>
+            <div class="highlight-bash"><div class="highlight"><pre><span></span>&lt;path-to-kafka&gt;/bin/kafka-streams-application-reset
+</pre></div>
+            </div>
+            <p>The tool accepts the following parameters:</p>
+            <div class="highlight-bash"><div class="highlight"><pre><span></span>Option <span class="o">(</span>* <span class="o">=</span> required<span class="o">)</span>                 Description
+---------------------                 -----------
+* --application-id &lt;String: id&gt;       The Kafka Streams application ID
+                                        <span class="o">(</span>application.id<span class="o">)</span>.
+--bootstrap-servers &lt;String: urls&gt;    Comma-separated list of broker urls with
+                                        format: HOST1:PORT1,HOST2:PORT2
+                                        <span class="o">(</span>default: localhost:9092<span class="o">)</span>
+--config-file &lt;String: file name&gt;     Property file containing configs to be
+                                        passed to admin clients and embedded
+                                        consumer.
+--dry-run                             Display the actions that would be
+                                        performed without executing the reset
+                                        commands.
+--input-topics &lt;String: list&gt;         Comma-separated list of user input
+                                        topics. For these topics, the tool will
+                                        reset the offset to the earliest
+                                        available offset.
+--intermediate-topics &lt;String: list&gt;  Comma-separated list of intermediate user
+                                        topics <span class="o">(</span>topics used in the through<span class="o">()</span>
+                                        method<span class="o">)</span>. For these topics, the tool
+                                        will skip to the end.
+--zookeeper                           Zookeeper option is deprecated by
+                                        bootstrap.servers, as the reset tool
+                                        would no longer access Zookeeper
+                                        directly.
+</pre></div>
+            </div>
+            <p>Parameters can be combined as needed.  For example, if you want to restart an application from an
+                empty internal state, but not reprocess previous data, simply omit the parameters <code class="docutils literal"><span class="pre">--input-topics</span></code> and
+                <code class="docutils literal"><span class="pre">--intermediate-topics</span></code>.</p>
+        </div>
+        <div class="section" id="step-2-reset-the-local-environments-of-your-application-instances">
+            <span id="streams-developer-guide-reset-local-environment"></span><h2>Step 2: Reset the local environments of your application instances<a class="headerlink" href="#step-2-reset-the-local-environments-of-your-application-instances" title="Permalink to this headline"></a></h2>
+            <p>For a complete application reset, you must delete the application&#8217;s local state directory on any machines where the
+                application instance was run. You must do this before restarting an application instance on the same machine.  You can
+                use either of these methods:</p>
+            <ul class="simple">
+                <li>The API method <code class="docutils literal"><span class="pre">KafkaStreams#cleanUp()</span></code> in your application code (see the sketch below).</li>
+                <li>Manually delete the corresponding local state directory (default location: <code class="docutils literal"><span class="pre">/var/lib/kafka-streams/&lt;application.id&gt;</span></code>). For more information, see the <a class="reference internal" href="../javadocs.html#streams-javadocs"><span class="std std-ref">state.dir</span></a> setting in the StreamsConfig class.</li>
+            </ul>
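+            <p>A minimal sketch of the API approach (the topology and configuration values are placeholders);
+                note that <code class="docutils literal"><span class="pre">cleanUp()</span></code> may only be called while the instance is not running,
+                i.e., before <code class="docutils literal"><span class="pre">start()</span></code> or after <code class="docutils literal"><span class="pre">close()</span></code>:</p>
+            <div class="highlight-java"><div class="highlight"><pre><span></span>Properties props = new Properties();
+props.put(StreamsConfig.APPLICATION_ID_CONFIG, "my-streams-app");    // placeholder
+props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // placeholder
+
+StreamsBuilder builder = new StreamsBuilder();
+builder.stream("input-topic").to("output-topic"); // trivial placeholder topology
+
+KafkaStreams streams = new KafkaStreams(builder.build(), props);
+// Delete this instance's local state directory so processing starts from an
+// empty local state; safe here because the instance has not been started yet.
+streams.cleanUp();
+streams.start();
+</pre></div>
+            </div>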
+</div>
+</div>
+
+
+               </div>
+              </div>
+  <div class="pagination">
+    <a href="/{{version}}/documentation/streams/developer-guide/security" class="pagination__btn pagination__btn__prev">Previous</a>
+      <a href="/{{version}}/documentation/streams/developer-guide/" class="pagination__btn pagination__btn__next">Next</a>
+  </div>
+</script>
+
+<!--#include virtual="../../../includes/_header.htm" -->
+<!--#include virtual="../../../includes/_top.htm" -->
+<div class="content documentation documentation--current">
+  <!--#include virtual="../../../includes/_nav.htm" -->
+  <div class="right">
+    <!--#include virtual="../../../includes/_docs_banner.htm" -->
+    <ul class="breadcrumbs">
+      <li><a href="/documentation">Documentation</a></li>
+      <li><a href="/documentation/streams">Kafka Streams</a></li>
+      <li><a href="/documentation/streams/developer-guide/">Developer Guide</a></li>
+    </ul>
+    <div class="p-content"></div>
+  </div>
+</div>
+<!--#include virtual="../../../includes/_footer.htm" -->
+<script>
+    $(function() {
+        // Show selected style on nav item
+        $('.b-nav__streams').addClass('selected');
+
+        //sticky secondary nav
+        var $navbar = $(".sub-nav-sticky"),
+            y_pos = $navbar.offset().top,
+            height = $navbar.height();
+
+        $(window).scroll(function() {
+            var scrollTop = $(window).scrollTop();
+
+            if (scrollTop > y_pos - height) {
+                $navbar.addClass("navbar-fixed")
+            } else if (scrollTop <= y_pos) {
+                $navbar.removeClass("navbar-fixed")
+            }
+        });
+
+        // Display docs subnav items
+        $('.b-nav__docs').parent().toggleClass('nav__item__with__subs--expanded');
+    });
+</script>
\ No newline at end of file

