sqoop-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jar...@apache.org
Subject sqoop git commit: SQOOP-2191: Provide an option automatically choose one mapper when neither primary key is defined nor split by column is provided
Date Fri, 13 Mar 2015 23:59:10 GMT
Repository: sqoop
Updated Branches:
  refs/heads/trunk fca329b32 -> 2576ca2f0


SQOOP-2191: Provide an option automatically choose one mapper when neither primary key is defined nor split by column is provided

(Venkat Ranganathan via Jarek Jarcec Cecho)


Project: http://git-wip-us.apache.org/repos/asf/sqoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/sqoop/commit/2576ca2f
Tree: http://git-wip-us.apache.org/repos/asf/sqoop/tree/2576ca2f
Diff: http://git-wip-us.apache.org/repos/asf/sqoop/diff/2576ca2f

Branch: refs/heads/trunk
Commit: 2576ca2f03a7929f7ff1d4b10867fdee382e8856
Parents: fca329b
Author: Jarek Jarcec Cecho <jarcec@apache.org>
Authored: Fri Mar 13 16:58:28 2015 -0700
Committer: Jarek Jarcec Cecho <jarcec@apache.org>
Committed: Fri Mar 13 16:58:28 2015 -0700

----------------------------------------------------------------------
 src/docs/user/SqoopUserGuide.xml                | 1151 ++++++++++++++++++
 src/docs/user/import-all-tables-purpose.txt     |    6 +-
 src/docs/user/import-all-tables.txt             |    6 +-
 src/docs/user/import.txt                        |   14 +-
 src/java/org/apache/sqoop/SqoopOptions.java     |   13 +
 .../org/apache/sqoop/accumulo/AccumuloUtil.java |   24 +-
 .../sqoop/config/ConfigurationHelper.java       |   40 +-
 .../org/apache/sqoop/manager/SqlManager.java    |   11 +-
 .../mapreduce/hcat/SqoopHCatUtilities.java      |   19 +-
 .../org/apache/sqoop/tool/BaseSqoopTool.java    |    6 +
 .../apache/sqoop/tool/ImportAllTablesTool.java  |    6 +
 src/java/org/apache/sqoop/tool/ImportTool.java  |   13 +-
 src/test/com/cloudera/sqoop/TestAvroImport.java |    3 +-
 .../com/cloudera/sqoop/TestBoundaryQuery.java   |    2 +-
 .../com/cloudera/sqoop/TestParquetImport.java   |    2 +
 .../com/cloudera/sqoop/TestSqoopOptions.java    |   21 +
 .../sqoop/testutil/ImportJobTestCase.java       |    2 +-
 .../org/apache/sqoop/TestAutoResetMapper.java   |  191 +++
 .../TestMainframeDatasetInputFormat.java        |    5 +-
 19 files changed, 1481 insertions(+), 54 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/docs/user/SqoopUserGuide.xml
----------------------------------------------------------------------
diff --git a/src/docs/user/SqoopUserGuide.xml b/src/docs/user/SqoopUserGuide.xml
new file mode 100644
index 0000000..a7fd3ff
--- /dev/null
+++ b/src/docs/user/SqoopUserGuide.xml
@@ -0,0 +1,1151 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<?asciidoc-toc?>
+<?asciidoc-numbered?>
+
+<article lang="en">
+<articleinfo>
+    <title>Sqoop User Guide (v1.4.6-SNAPSHOT)</title>
+</articleinfo>
+<screen>  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.</screen>
+<section id="_introduction">
+<title>Introduction</title>
+<simpara>Sqoop is a tool designed to transfer data between Hadoop and
+relational databases or mainframes. You can use Sqoop to import data from a
+relational database management system (RDBMS) such as MySQL or Oracle or a
+mainframe into the Hadoop Distributed File System (HDFS),
+transform the data in Hadoop MapReduce, and then export the data back
+into an RDBMS.</simpara>
+<simpara>Sqoop automates most of this process, relying on the database to
+describe the schema for the data to be imported. Sqoop uses MapReduce
+to import and export the data, which provides parallel operation as
+well as fault tolerance.</simpara>
+<simpara>This document describes how to get started using Sqoop to move data
+between databases and Hadoop or mainframe to Hadoop and provides reference
+information for the operation of the Sqoop command-line tool suite. This
+document is intended for:</simpara>
+<itemizedlist>
+<listitem>
+<simpara>
+System and application programmers
+</simpara>
+</listitem>
+<listitem>
+<simpara>
+System administrators
+</simpara>
+</listitem>
+<listitem>
+<simpara>
+Database administrators
+</simpara>
+</listitem>
+<listitem>
+<simpara>
+Data analysts
+</simpara>
+</listitem>
+<listitem>
+<simpara>
+Data engineers
+</simpara>
+</listitem>
+</itemizedlist>
+</section>
+<section id="_supported_releases">
+<title>Supported Releases</title>
+<simpara>This documentation applies to Sqoop v1.4.6-SNAPSHOT.</simpara>
+</section>
+<section id="_sqoop_releases">
+<title>Sqoop Releases</title>
+<simpara>Sqoop is an open source software product of the Apache Software Foundation.</simpara>
+<simpara>Software development for Sqoop occurs at <ulink url="http://sqoop.apache.org">http://sqoop.apache.org</ulink>
+At that site you can obtain:</simpara>
+<itemizedlist>
+<listitem>
+<simpara>
+New releases of Sqoop as well as its most recent source code
+</simpara>
+</listitem>
+<listitem>
+<simpara>
+An issue tracker
+</simpara>
+</listitem>
+<listitem>
+<simpara>
+A wiki that contains Sqoop documentation
+</simpara>
+</listitem>
+</itemizedlist>
+</section>
+<section id="_prerequisites">
+<title>Prerequisites</title>
+<simpara>The following prerequisite knowledge is required for this product:</simpara>
+<itemizedlist>
+<listitem>
+<simpara>
+Basic computer technology and terminology
+</simpara>
+</listitem>
+<listitem>
+<simpara>
+Familiarity with command-line interfaces such as <literal>bash</literal>
+</simpara>
+</listitem>
+<listitem>
+<simpara>
+Relational database management systems
+</simpara>
+</listitem>
+<listitem>
+<simpara>
+Basic familiarity with the purpose and operation of Hadoop
+</simpara>
+</listitem>
+</itemizedlist>
+<simpara>Before you can use Sqoop, a release of Hadoop must be installed and
+configured. Sqoop is currently supporting 4 major Hadoop releases - 0.20,
+0.23, 1.0 and 2.0.</simpara>
+<simpara>This document assumes you are using a Linux or Linux-like environment.
+If you are using Windows, you may be able to use cygwin to accomplish
+most of the following tasks. If you are using Mac OS X, you should see
+few (if any) compatibility errors. Sqoop is predominantly operated and
+tested on Linux.</simpara>
+</section>
+<section id="_basic_usage">
+<title>Basic Usage</title>
+<simpara>With Sqoop, you can <emphasis>import</emphasis> data from a relational database system or a
+mainframe into HDFS. The input to the import process is either database table
+or mainframe datasets. For databases, Sqoop will read the table row-by-row
+into HDFS.  For mainframe datasets, Sqoop will read records from each mainframe
+dataset into HDFS. The output of this import process is a set of files
+containing a copy of the imported table or datasets.
+The import process is performed in parallel. For this reason, the
+output will be in multiple files. These files may be delimited text
+files (for example, with commas or tabs separating each field), or
+binary Avro or SequenceFiles containing serialized record data.</simpara>
+<simpara>A by-product of the import process is a generated Java class which
+can encapsulate one row of the imported table. This class is used
+during the import process by Sqoop itself. The Java source code for
+this class is also provided to you, for use in subsequent MapReduce
+processing of the data. This class can serialize and deserialize data
+to and from the SequenceFile format. It can also parse the
+delimited-text form of a record. These abilities allow you to quickly
+develop MapReduce applications that use the HDFS-stored records in
+your processing pipeline. You are also free to parse the delimiteds
+record data yourself, using any other tools you prefer.</simpara>
+<simpara>After manipulating the imported records (for example, with MapReduce
+or Hive) you may have a result data set which you can then <emphasis>export</emphasis>
+back to the relational database. Sqoop&#8217;s export process will read
+a set of delimited text files from HDFS in parallel, parse them into
+records, and insert them as new rows in a target database table, for
+consumption by external applications or users.</simpara>
+<simpara>Sqoop includes some other commands which allow you to inspect the
+database you are working with. For example, you can list the available
+database schemas (with the <literal>sqoop-list-databases</literal> tool) and tables
+within a schema (with the <literal>sqoop-list-tables</literal> tool). Sqoop also
+includes a primitive SQL execution shell (the <literal>sqoop-eval</literal> tool).</simpara>
+<simpara>Most aspects of the import, code generation, and export processes can
+be customized. For databases, you can control the specific row range or
+columns imported.  You can specify particular delimiters and escape characters
+for the file-based representation of the data, as well as the file format
+used.  You can also control the class or package names used in
+generated code. Subsequent sections of this document explain how to
+specify these and other arguments to Sqoop.</simpara>
+</section>
+<section id="_sqoop_tools">
+<title>Sqoop Tools</title>
+<simpara>Sqoop is a collection of related tools. To use Sqoop, you specify the
+tool you want to use and the arguments that control the tool.</simpara>
+<simpara>If Sqoop is compiled from its own source, you can run Sqoop without a formal
+installation process by running the <literal>bin/sqoop</literal> program. Users
+of a packaged deployment of Sqoop (such as an RPM shipped with Apache Bigtop)
+will see this program installed as <literal>/usr/bin/sqoop</literal>. The remainder of this
+documentation will refer to this program as <literal>sqoop</literal>. For example:</simpara>
+<screen>$ sqoop tool-name [tool-arguments]</screen>
+<note><simpara>The following examples that begin with a <literal>$</literal> character indicate
+that the commands must be entered at a terminal prompt (such as
+<literal>bash</literal>). The <literal>$</literal> character represents the prompt itself; you should
+not start these commands by typing a <literal>$</literal>. You can also enter commands
+inline in the text of a paragraph; for example, <literal>sqoop help</literal>. These
+examples do not show a <literal>$</literal> prefix, but you should enter them the same
+way.  Don&#8217;t confuse the <literal>$</literal> shell prompt in the examples with the <literal>$</literal>
+that precedes an environment variable name. For example, the string
+literal <literal>$HADOOP_HOME</literal> includes a "<literal>$</literal>".</simpara></note>
+<simpara>Sqoop ships with a help tool. To display a list of all available
+tools, type the following command:</simpara>
+<screen>$ sqoop help
+usage: sqoop COMMAND [ARGS]
+
+Available commands:
+  codegen            Generate code to interact with database records
+  create-hive-table  Import a table definition into Hive
+  eval               Evaluate a SQL statement and display the results
+  export             Export an HDFS directory to a database table
+  help               List available commands
+  import             Import a table from a database to HDFS
+  import-all-tables  Import tables from a database to HDFS
+  import-mainframe   Import mainframe datasets to HDFS
+  list-databases     List available databases on a server
+  list-tables        List available tables in a database
+  version            Display version information
+
+See 'sqoop help COMMAND' for information on a specific command.</screen>
+<simpara>You can display help for a specific tool by entering: <literal>sqoop help
+(tool-name)</literal>; for example, <literal>sqoop help import</literal>.</simpara>
+<simpara>You can also add the <literal>--help</literal> argument to any command: <literal>sqoop import
+--help</literal>.</simpara>
+<section id="_using_command_aliases">
+<title>Using Command Aliases</title>
+<simpara>In addition to typing the <literal>sqoop (toolname)</literal> syntax, you can use alias
+scripts that specify the <literal>sqoop-(toolname)</literal> syntax. For example, the
+scripts <literal>sqoop-import</literal>, <literal>sqoop-export</literal>, etc. each select a specific
+tool.</simpara>
+</section>
+<section id="_controlling_the_hadoop_installation">
+<title>Controlling the Hadoop Installation</title>
+<simpara>You invoke Sqoop through the program launch capability provided by
+Hadoop. The <literal>sqoop</literal> command-line program is a wrapper which runs the
+<literal>bin/hadoop</literal> script shipped with Hadoop. If you have multiple
+installations of Hadoop present on your machine, you can select the
+Hadoop installation by setting the <literal>$HADOOP_COMMON_HOME</literal> and
+<literal>$HADOOP_MAPRED_HOME</literal> environment variables.</simpara>
+<simpara>For example:</simpara>
+<screen>$ HADOOP_COMMON_HOME=/path/to/some/hadoop \
+  HADOOP_MAPRED_HOME=/path/to/some/hadoop-mapreduce \
+  sqoop import --arguments...</screen>
+<simpara>or:</simpara>
+<screen>$ export HADOOP_COMMON_HOME=/some/path/to/hadoop
+$ export HADOOP_MAPRED_HOME=/some/path/to/hadoop-mapreduce
+$ sqoop import --arguments...</screen>
+<simpara>If either of these variables are not set, Sqoop will fall back to
+<literal>$HADOOP_HOME</literal>. If it is not set either, Sqoop will use the default
+installation locations for Apache Bigtop, <literal>/usr/lib/hadoop</literal> and
+<literal>/usr/lib/hadoop-mapreduce</literal>, respectively.</simpara>
+<simpara>The active Hadoop configuration is loaded from <literal>$HADOOP_HOME/conf/</literal>,
+unless the <literal>$HADOOP_CONF_DIR</literal> environment variable is set.</simpara>
+</section>
+<section id="_using_generic_and_specific_arguments">
+<title>Using Generic and Specific Arguments</title>
+<simpara>To control the operation of each Sqoop tool, you use generic and
+specific arguments.</simpara>
+<simpara>For example:</simpara>
+<screen>$ sqoop help import
+usage: sqoop import [GENERIC-ARGS] [TOOL-ARGS]
+
+Common arguments:
+   --connect &lt;jdbc-uri&gt;     Specify JDBC connect string
+   --connect-manager &lt;class-name&gt;     Specify connection manager class to use
+   --driver &lt;class-name&gt;    Manually specify JDBC driver class to use
+   --hadoop-mapred-home &lt;dir&gt;      Override $HADOOP_MAPRED_HOME
+   --help                   Print usage instructions
+   --password-file          Set path for file containing authentication password
+   -P                       Read password from console
+   --password &lt;password&gt;    Set authentication password
+   --username &lt;username&gt;    Set authentication username
+   --verbose                Print more information while working
+   --hadoop-home &lt;dir&gt;     Deprecated. Override $HADOOP_HOME
+
+[...]
+
+Generic Hadoop command-line arguments:
+(must preceed any tool-specific arguments)
+Generic options supported are
+-conf &lt;configuration file&gt;     specify an application configuration file
+-D &lt;property=value&gt;            use value for given property
+-fs &lt;local|namenode:port&gt;      specify a namenode
+-jt &lt;local|jobtracker:port&gt;    specify a job tracker
+-files &lt;comma separated list of files&gt;    specify comma separated files to be copied to the map reduce cluster
+-libjars &lt;comma separated list of jars&gt;    specify comma separated jar files to include in the classpath.
+-archives &lt;comma separated list of archives&gt;    specify comma separated archives to be unarchived on the compute machines.
+
+The general command line syntax is
+bin/hadoop command [genericOptions] [commandOptions]</screen>
+<simpara>You must supply the generic arguments <literal>-conf</literal>, <literal>-D</literal>, and so on after the
+tool name but <emphasis role="strong">before</emphasis> any tool-specific arguments (such as
+<literal>--connect</literal>). Note that generic Hadoop arguments are preceeded by a
+single dash character (<literal>-</literal>), whereas tool-specific arguments start
+with two dashes (<literal>--</literal>), unless they are single character arguments such as <literal>-P</literal>.</simpara>
+<simpara>The <literal>-conf</literal>, <literal>-D</literal>, <literal>-fs</literal> and <literal>-jt</literal> arguments control the configuration
+and Hadoop server settings. For example, the <literal>-D mapred.job.name=&lt;job_name&gt;</literal> can
+be used to set the name of the MR job that Sqoop launches, if not specified,
+the name defaults to the jar name for the job - which is derived from the used
+table name.</simpara>
+<simpara>The <literal>-files</literal>, <literal>-libjars</literal>, and <literal>-archives</literal> arguments are not typically used with
+Sqoop, but they are included as part of Hadoop&#8217;s internal argument-parsing
+system.</simpara>
+</section>
+<section id="_using_options_files_to_pass_arguments">
+<title>Using Options Files to Pass Arguments</title>
+<simpara>When using Sqoop, the command line options that do not change from
+invocation to invocation can be put in an options file for convenience.
+An options file is a text file where each line identifies an option in
+the order that it appears otherwise on the command line. Option files
+allow specifying a single option on multiple lines by using the
+back-slash character at the end of intermediate lines. Also supported
+are comments within option files that begin with the hash character.
+Comments must be specified on a new line and may not be mixed with
+option text. All comments and empty lines are ignored when option
+files are expanded. Unless options appear as quoted strings, any
+leading or trailing spaces are ignored. Quoted strings if used must
+not extend beyond the line on which they are specified.</simpara>
+<simpara>Option files can be specified anywhere in the command line as long as
+the options within them follow the otherwise prescribed rules of
+options ordering. For instance, regardless of where the options are
+loaded from, they must follow the ordering such that generic options
+appear first, tool specific options next, finally followed by options
+that are intended to be passed to child programs.</simpara>
+<simpara>To specify an options file, simply create an options file in a
+convenient location and pass it to the command line via
+<literal>--options-file</literal> argument.</simpara>
+<simpara>Whenever an options file is specified, it is expanded on the
+command line before the tool is invoked. You can specify more than
+one option files within the same invocation if needed.</simpara>
+<simpara>For example, the following Sqoop invocation for import can
+be specified alternatively as shown below:</simpara>
+<screen>$ sqoop import --connect jdbc:mysql://localhost/db --username foo --table TEST
+
+$ sqoop --options-file /users/homer/work/import.txt --table TEST</screen>
+<simpara>where the options file <literal>/users/homer/work/import.txt</literal> contains the following:</simpara>
+<screen>import
+--connect
+jdbc:mysql://localhost/db
+--username
+foo</screen>
+<simpara>The options file can have empty lines and comments for readability purposes.
+So the above example would work exactly the same if the options file
+<literal>/users/homer/work/import.txt</literal> contained the following:</simpara>
+<screen>#
+# Options file for Sqoop import
+#
+
+# Specifies the tool being invoked
+import
+
+# Connect parameter and value
+--connect
+jdbc:mysql://localhost/db
+
+# Username parameter and value
+--username
+foo
+
+#
+# Remaining options should be specified in the command line.
+#</screen>
+</section>
+<section id="_using_tools">
+<title>Using Tools</title>
+<simpara>The following sections will describe each tool&#8217;s operation. The
+tools are listed in the most likely order you will find them useful.</simpara>
+</section>
+</section>
+<section id="_literal_sqoop_import_literal">
+<title><literal>sqoop-import</literal></title>
+<section id="_purpose">
+<title>Purpose</title>
+<simpara>The <literal>import</literal> tool imports an individual table from an RDBMS to HDFS.
+Each row from a table is represented as a separate record in HDFS.
+Records can be stored as text files (one record per line), or in
+binary representation as Avro or SequenceFiles.</simpara>
+</section>
+<section id="_syntax">
+<title>Syntax</title>
+<screen>$ sqoop import (generic-args) (import-args)
+$ sqoop-import (generic-args) (import-args)</screen>
+<simpara>While the Hadoop generic arguments must precede any import arguments,
+you can type the import arguments in any order with respect to one
+another.</simpara>
+<note><simpara>In this document, arguments are grouped into collections
+organized by function. Some collections are present in several tools
+(for example, the "common" arguments). An extended description of their
+functionality is given only on the first presentation in this
+document.</simpara></note>
+<table pgwide="0"
+frame="topbot"
+rowsep="1" colsep="1"
+>
+<title>Common arguments</title>
+<tgroup cols="2">
+<colspec colwidth="248*" align="left"/>
+<colspec colwidth="230*" align="left"/>
+<thead>
+  <row>
+    <entry>
+    Argument
+    </entry>
+    <entry>
+    Description
+    </entry>
+  </row>
+</thead>
+<tbody>
+  <row>
+    <entry>
+    <literal>--connect &lt;jdbc-uri&gt;</literal>
+    </entry>
+    <entry>
+    Specify JDBC connect string
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--connection-manager &lt;class-name&gt;</literal>
+    </entry>
+    <entry>
+    Specify connection manager class to                                          use
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--driver &lt;class-name&gt;</literal>
+    </entry>
+    <entry>
+    Manually specify JDBC driver class                                          to use
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--hadoop-mapred-home &lt;dir&gt;</literal>
+    </entry>
+    <entry>
+    Override $HADOOP_MAPRED_HOME
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--help</literal>
+    </entry>
+    <entry>
+    Print usage instructions
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--password-file</literal>
+    </entry>
+    <entry>
+    Set path for a file containing the                                          authentication password
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>-P</literal>
+    </entry>
+    <entry>
+    Read password from console
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--password &lt;password&gt;</literal>
+    </entry>
+    <entry>
+    Set authentication password
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--username &lt;username&gt;</literal>
+    </entry>
+    <entry>
+    Set authentication username
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--verbose</literal>
+    </entry>
+    <entry>
+    Print more information while working
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--connection-param-file &lt;filename&gt;</literal>
+    </entry>
+    <entry>
+    Optional properties file that                                          provides connection parameters
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--relaxed-isolation</literal>
+    </entry>
+    <entry>
+    Set connection transaction isolation                                          to read uncommitted for the mappers.
+    </entry>
+  </row>
+</tbody>
+</tgroup>
+</table>
+<section id="_connecting_to_a_database_server">
+<title>Connecting to a Database Server</title>
+<simpara>Sqoop is designed to import tables from a database into HDFS. To do
+so, you must specify a <emphasis>connect string</emphasis> that describes how to connect to the
+database. The <emphasis>connect string</emphasis> is similar to a URL, and is communicated to
+Sqoop with the <literal>--connect</literal> argument. This describes the server and
+database to connect to; it may also specify the port. For example:</simpara>
+<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees</screen>
+<simpara>This string will connect to a MySQL database named <literal>employees</literal> on the
+host <literal>database.example.com</literal>. It&#8217;s important that you <emphasis role="strong">do not</emphasis> use the URL
+<literal>localhost</literal> if you intend to use Sqoop with a distributed Hadoop
+cluster. The connect string you supply will be used on TaskTracker nodes
+throughout your MapReduce cluster; if you specify the
+literal name <literal>localhost</literal>, each node will connect to a different
+database (or more likely, no database at all). Instead, you should use
+the full hostname or IP address of the database host that can be seen
+by all your remote nodes.</simpara>
+<simpara>You might need to authenticate against the database before you can
+access it. You can use the <literal>--username</literal> to supply a username to the database.
+Sqoop provides couple of different ways to supply a password,
+secure and non-secure, to the database which is detailed below.</simpara>
+<formalpara><title>Secure way of supplying password to the database</title><para>You should save the password in a file on the users home directory with 400
+permissions and specify the path to that file using the <emphasis role="strong"><literal>--password-file</literal></emphasis>
+argument, and is the preferred method of entering credentials. Sqoop will
+then read the password from the file and pass it to the MapReduce cluster
+using secure means with out exposing the password in the job configuration.
+The file containing the password can either be on the Local FS or HDFS.
+For example:</para></formalpara>
+<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees \
+    --username venkatesh --password-file ${user.home}/.password</screen>
+<warning><simpara>Sqoop will read entire content of the password file and use it as
+a password. This will include any trailing white space characters such as
+new line characters that are added by default by most of the text editors.
+You need to make sure that your password file contains only characters
+that belongs to your password. On the command line you can use command
+<literal>echo</literal> with switch <literal>-n</literal> to store password without any trailing white space
+characters. For example to store password <literal>secret</literal> you would call
+<literal>echo -n "secret" &gt; password.file</literal>.</simpara></warning>
+<simpara>Another way of supplying passwords is using the <literal>-P</literal> argument which will
+read a password from a console prompt.</simpara>
+<formalpara><title>Protecting password from preying eyes</title><para>Hadoop 2.6.0 provides an API to separate password storage from applications.
+This API is called the credential provided API and there is a new
+<literal>credential</literal>  command line tool to manage passwords and their aliases.
+The passwords are stored with their aliases in a keystore that is password
+protected.   The keystore password can be the provided to a password prompt
+on the command line, via an environment variable or defaulted to a software
+defined constant.   Please check the Hadoop documentation on the usage
+of this facility.</para></formalpara>
+<simpara>Once the password is stored using the Credential Provider facility and
+the Hadoop configuration has been suitably updated, all applications can
+optionally use the alias in place of the actual password and at runtime
+resolve the alias for the password to use.</simpara>
+<simpara>Since the keystore or similar technology used for storing the credential
+provider is shared across components, passwords for various applications,
+various database and other passwords can be securely stored in them and only
+the alias needs to be exposed in configuration files, protecting the password
+from being visible.</simpara>
+<simpara>Sqoop has been enhanced to allow usage of this funcionality if it is
+available in the underlying Hadoop version being used.   One new option
+has been introduced to provide the alias on the command line instead of the
+actual password (--password-alias).  The argument value this option is
+the alias on the storage associated with the actual password.
+Example usage is as follows:</simpara>
+<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees \
+    --username dbuser --password-alias mydb.password.alias</screen>
+<simpara>Similarly, if the command line option is not preferred, the alias can be saved
+in the file provided with --password-file option.  Along with this, the
+Sqoop configuration parameter org.apache.sqoop.credentials.loader.class
+should be set to the classname that provides the alias resolution:
+<literal>org.apache.sqoop.util.password.CredentialProviderPasswordLoader</literal></simpara>
+<simpara>Example usage is as follows (assuming .password.alias has the alias for
+the real password) :</simpara>
+<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees \
+    --username dbuser --password-file ${user.home}/.password-alias</screen>
+<warning><simpara>The <literal>--password</literal> parameter is insecure, as other users may
+be able to read your password from the command-line arguments via
+the output of programs such as <literal>ps</literal>. The <emphasis role="strong"><literal>-P</literal></emphasis> argument is the preferred
+method over using the <literal>--password</literal> argument. Credentials may still be
+transferred between nodes of the MapReduce cluster using insecure means.
+For example:</simpara></warning>
+<screen>$ sqoop import --connect jdbc:mysql://database.example.com/employees \
+    --username aaron --password 12345</screen>
+<simpara>Sqoop automatically supports several databases, including MySQL.  Connect
+strings beginning with <literal>jdbc:mysql://</literal> are handled automatically in Sqoop.  (A
+full list of databases with built-in support is provided in the "Supported
+Databases" section. For some, you may need to install the JDBC driver
+yourself.)</simpara>
+<simpara>You can use Sqoop with any other
+JDBC-compliant database. First, download the appropriate JDBC
+driver for the type of database you want to import, and install the .jar
+file in the <literal>$SQOOP_HOME/lib</literal> directory on your client machine. (This will
+be <literal>/usr/lib/sqoop/lib</literal> if you installed from an RPM or Debian package.)
+Each driver <literal>.jar</literal> file also has a specific driver class which defines
+the entry-point to the driver. For example, MySQL&#8217;s Connector/J library has
+a driver class of <literal>com.mysql.jdbc.Driver</literal>. Refer to your database
+vendor-specific documentation to determine the main driver class.
+This class must be provided as an argument to Sqoop with <literal>--driver</literal>.</simpara>
+<simpara>For example, to connect to a SQLServer database, first download the driver from
+microsoft.com and install it in your Sqoop lib path.</simpara>
+<simpara>Then run Sqoop. For example:</simpara>
+<screen>$ sqoop import --driver com.microsoft.jdbc.sqlserver.SQLServerDriver \
+    --connect &lt;connect-string&gt; ...</screen>
+<simpara>When connecting to a database using JDBC, you can optionally specify extra
+JDBC parameters via a property file using the option
+<literal>--connection-param-file</literal>. The contents of this file are parsed as standard
+Java properties and passed into the driver while creating a connection.</simpara>
+<note><simpara>The parameters specified via the optional property file are only
+applicable to JDBC connections. Any fastpath connectors that use connections
+other than JDBC will ignore these parameters.</simpara></note>
+<table pgwide="0"
+frame="topbot"
+rowsep="1" colsep="1"
+>
+<title>Validation arguments <link linkend="validation">More Details</link></title>
+<tgroup cols="2">
+<colspec colwidth="267*" align="left"/>
+<colspec colwidth="230*" align="left"/>
+<thead>
+  <row>
+    <entry>
+    Argument
+    </entry>
+    <entry>
+    Description
+    </entry>
+  </row>
+</thead>
+<tbody>
+  <row>
+    <entry>
+    <literal>--validate</literal>
+    </entry>
+    <entry>
+    Enable validation of data copied,                                             supports single table copy only.
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--validator &lt;class-name&gt;</literal>
+    </entry>
+    <entry>
+    Specify validator class to use.
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--validation-threshold &lt;class-name&gt;</literal>
+    </entry>
+    <entry>
+    Specify validation threshold class                                             to use.
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--validation-failurehandler &lt;class-name&gt;</literal>
+    </entry>
+    <entry>
+    Specify validation failure                                             handler class to use.
+    </entry>
+  </row>
+</tbody>
+</tgroup>
+</table>
+<table pgwide="0"
+frame="topbot"
+rowsep="1" colsep="1"
+>
+<title>Import control arguments:</title>
+<tgroup cols="2">
+<colspec colwidth="206*" align="left"/>
+<colspec colwidth="236*" align="left"/>
+<thead>
+  <row>
+    <entry>
+    Argument
+    </entry>
+    <entry>
+    Description
+    </entry>
+  </row>
+</thead>
+<tbody>
+  <row>
+    <entry>
+    <literal>--append</literal>
+    </entry>
+    <entry>
+    Append data to an existing dataset                                  in HDFS
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--as-avrodatafile</literal>
+    </entry>
+    <entry>
+    Imports data to Avro Data Files
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--as-sequencefile</literal>
+    </entry>
+    <entry>
+    Imports data to SequenceFiles
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--as-textfile</literal>
+    </entry>
+    <entry>
+    Imports data as plain text (default)
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--as-parquetfile</literal>
+    </entry>
+    <entry>
+    Imports data to Parquet Files
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--boundary-query &lt;statement&gt;</literal>
+    </entry>
+    <entry>
+    Boundary query to use for creating splits
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--columns &lt;col,col,col&#8230;&gt;</literal>
+    </entry>
+    <entry>
+    Columns to import from table
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--delete-target-dir</literal>
+    </entry>
+    <entry>
+    Delete the import target directory                                  if it exists
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--direct</literal>
+    </entry>
+    <entry>
+    Use direct connector if exists for the database
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--fetch-size &lt;n&gt;</literal>
+    </entry>
+    <entry>
+    Number of entries to read from database                                  at once.
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--inline-lob-limit &lt;n&gt;</literal>
+    </entry>
+    <entry>
+    Set the maximum size for an inline LOB
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>-m,--num-mappers &lt;n&gt;</literal>
+    </entry>
+    <entry>
+    Use <emphasis>n</emphasis> map tasks to import in parallel
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>-e,--query &lt;statement&gt;</literal>
+    </entry>
+    <entry>
+    Import the results of <emphasis><literal>statement</literal></emphasis>.
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--split-by &lt;column-name&gt;</literal>
+    </entry>
+    <entry>
+    Column of the table used to split work                                  units
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--table &lt;table-name&gt;</literal>
+    </entry>
+    <entry>
+    Table to read
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--target-dir &lt;dir&gt;</literal>
+    </entry>
+    <entry>
+    HDFS destination dir
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--warehouse-dir &lt;dir&gt;</literal>
+    </entry>
+    <entry>
+    HDFS parent for table destination
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--where &lt;where clause&gt;</literal>
+    </entry>
+    <entry>
+    WHERE clause to use during import
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>-z,--compress</literal>
+    </entry>
+    <entry>
+    Enable compression
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--compression-codec &lt;c&gt;</literal>
+    </entry>
+    <entry>
+    Use Hadoop codec (default gzip)
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--null-string &lt;null-string&gt;</literal>
+    </entry>
+    <entry>
+    The string to be written for a null                                  value for string columns
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--null-non-string &lt;null-string&gt;</literal>
+    </entry>
+    <entry>
+    The string to be written for a null                                  value for non-string columns
+    </entry>
+  </row>
+</tbody>
+</tgroup>
+</table>
+<simpara>The <literal>--null-string</literal> and <literal>--null-non-string</literal> arguments are optional.\
+If not specified, then the string "null" will be used.</simpara>
+</section>
+<section id="_selecting_the_data_to_import">
+<title>Selecting the Data to Import</title>
+<simpara>Sqoop typically imports data in a table-centric fashion. Use the
+<literal>--table</literal> argument to select the table to import. For example, <literal>--table
+employees</literal>. This argument can also identify a <literal>VIEW</literal> or other table-like
+entity in a database.</simpara>
+<simpara>By default, all columns within a table are selected for import.
+Imported data is written to HDFS in its "natural order;" that is, a
+table containing columns A, B, and C result in an import of data such
+as:</simpara>
+<screen>A1,B1,C1
+A2,B2,C2
+...</screen>
+<simpara>You can select a subset of columns and control their ordering by using
+the <literal>--columns</literal> argument. This should include a comma-delimited list
+of columns to import. For example: <literal>--columns "name,employee_id,jobtitle"</literal>.</simpara>
+<simpara>You can control which rows are imported by adding a SQL <literal>WHERE</literal> clause
+to the import statement. By default, Sqoop generates statements of the
+form <literal>SELECT &lt;column list&gt; FROM &lt;table name&gt;</literal>. You can append a
+<literal>WHERE</literal> clause to this with the <literal>--where</literal> argument. For example: <literal>--where
+"id &gt; 400"</literal>. Only rows where the <literal>id</literal> column has a value greater than
+400 will be imported.</simpara>
+<simpara>By default sqoop will use query <literal>select min(&lt;split-by&gt;), max(&lt;split-by&gt;) from
+&lt;table name&gt;</literal> to find out boundaries for creating splits. In some cases this query
+is not the most optimal so you can specify any arbitrary query returning two
+numeric columns using <literal>--boundary-query</literal> argument.</simpara>
+</section>
+<section id="_free_form_query_imports">
+<title>Free-form Query Imports</title>
+<simpara>Sqoop can also import the result set of an arbitrary SQL query. Instead of
+using the <literal>--table</literal>, <literal>--columns</literal> and <literal>--where</literal> arguments, you can specify
+a SQL statement with the <literal>--query</literal> argument.</simpara>
+<simpara>When importing a free-form query, you must specify a destination directory
+with <literal>--target-dir</literal>.</simpara>
+<simpara>If you want to import the results of a query in parallel, then each map task
+will need to execute a copy of the query, with results partitioned by bounding
+conditions inferred by Sqoop. Your query must include the token <literal>$CONDITIONS</literal>
+which each Sqoop process will replace with a unique condition expression.
+You must also select a splitting column with <literal>--split-by</literal>.</simpara>
+<simpara>For example:</simpara>
+<screen>$ sqoop import \
+  --query 'SELECT a.*, b.* FROM a JOIN b on (a.id == b.id) WHERE $CONDITIONS' \
+  --split-by a.id --target-dir /user/foo/joinresults</screen>
+<simpara>Alternately, the query can be executed once and imported serially, by
+specifying a single map task with <literal>-m 1</literal>:</simpara>
+<screen>$ sqoop import \
+  --query 'SELECT a.*, b.* FROM a JOIN b on (a.id == b.id) WHERE $CONDITIONS' \
+  -m 1 --target-dir /user/foo/joinresults</screen>
+<note><simpara>If you are issuing the query wrapped with double quotes ("),
+you will have to use <literal>\$CONDITIONS</literal> instead of just <literal>$CONDITIONS</literal>
+to disallow your shell from treating it as a shell variable.
+For example, a double quoted query may look like:
+<literal>"SELECT * FROM x WHERE a='foo' AND \$CONDITIONS"</literal></simpara></note>
+<note><simpara>The facility of using free-form query in the current version of Sqoop
+is limited to simple queries where there are no ambiguous projections and
+no <literal>OR</literal> conditions in the <literal>WHERE</literal> clause. Use of complex queries such as
+queries that have sub-queries or joins leading to ambiguous projections can
+lead to unexpected results.</simpara></note>
+</section>
+<section id="_controlling_parallelism">
+<title>Controlling Parallelism</title>
+<simpara>Sqoop imports data in parallel from most database sources. You can
+specify the number
+of map tasks (parallel processes) to use to perform the import by
+using the <literal>-m</literal> or <literal>--num-mappers</literal> argument. Each of these arguments
+takes an integer value which corresponds to the degree of parallelism
+to employ. By default, four tasks are used. Some databases may see
+improved performance by increasing this value to 8 or 16. Do not
+increase the degree of parallelism greater than that available within
+your MapReduce cluster; tasks will run serially and will likely
+increase the amount of time required to perform the import. Likewise,
+do not increase the degree of parallism higher than that which your
+database can reasonably support. Connecting 100 concurrent clients to
+your database may increase the load on the database server to a point
+where performance suffers as a result.</simpara>
+<simpara>When performing parallel imports, Sqoop needs a criterion by which it
+can split the workload. Sqoop uses a <emphasis>splitting column</emphasis> to split the
+workload. By default, Sqoop will identify the primary key column (if
+present) in a table and use it as the splitting column. The low and
+high values for the splitting column are retrieved from the database,
+and the map tasks operate on evenly-sized components of the total
+range. For example, if you had a table with a primary key column of
+<literal>id</literal> whose minimum value was 0 and maximum value was 1000, and Sqoop
+was directed to use 4 tasks, Sqoop would run four processes which each
+execute SQL statements of the form <literal>SELECT * FROM sometable WHERE id
+&gt;= lo AND id &lt; hi</literal>, with <literal>(lo, hi)</literal> set to (0, 250), (250, 500),
+(500, 750), and (750, 1001) in the different tasks.</simpara>
+<simpara>If the actual values for the primary key are not uniformly distributed
+across its range, then this can result in unbalanced tasks. You should
+explicitly choose a different column with the <literal>--split-by</literal> argument.
+For example, <literal>--split-by employee_id</literal>. Sqoop cannot currently split on
+multi-column indices. If your table has no index column, or has a
+multi-column key, then you must also manually choose a splitting
+column.</simpara>
+</section>
+<section id="_controlling_distributed_cache">
+<title>Controlling Distributed Cache</title>
+<simpara>Sqoop will copy the jars in $SQOOP_HOME/lib folder to job cache every
+time when start a Sqoop job. When launched by Oozie this is unnecessary
+since Oozie use its own Sqoop share lib which keeps Sqoop dependencies
+in the distributed cache. Oozie will do the localization on each
+worker node for the Sqoop dependencies only once during the first Sqoop
+job and reuse the jars on worker node for subsquencial jobs. Using
+option <literal>--skip-dist-cache</literal> in Sqoop command when launched by Oozie will
+skip the step which Sqoop copies its dependencies to job cache and save
+massive I/O.</simpara>
+</section>
+<section id="_controlling_the_import_process">
+<title>Controlling the Import Process</title>
+<simpara>By default, the import process will use JDBC which provides a
+reasonable cross-vendor import channel. Some databases can perform
+imports in a more high-performance fashion by using database-specific
+data movement tools. For example, MySQL provides the <literal>mysqldump</literal> tool
+which can export data from MySQL to other systems very quickly. By
+supplying the <literal>--direct</literal> argument, you are specifying that Sqoop
+should attempt the direct import channel. This channel may be
+higher performance than using JDBC.</simpara>
+<simpara>Details about use of direct mode with each specific RDBMS, installation requirements, available
+options and limitations can be found in <xref linkend="connectors"/>.</simpara>
+<simpara>By default, Sqoop will import a table named <literal>foo</literal> to a directory named
+<literal>foo</literal> inside your home directory in HDFS. For example, if your
+username is <literal>someuser</literal>, then the import tool will write to
+<literal>/user/someuser/foo/(files)</literal>. You can adjust the parent directory of
+the import with the <literal>--warehouse-dir</literal> argument. For example:</simpara>
+<screen>$ sqoop import --connnect &lt;connect-str&gt; --table foo --warehouse-dir /shared \
+    ...</screen>
+<simpara>This command would write to a set of files in the <literal>/shared/foo/</literal> directory.</simpara>
+<simpara>You can also explicitly choose the target directory, like so:</simpara>
+<screen>$ sqoop import --connnect &lt;connect-str&gt; --table foo --target-dir /dest \
+    ...</screen>
+<simpara>This will import the files into the <literal>/dest</literal> directory. <literal>--target-dir</literal> is
+incompatible with <literal>--warehouse-dir</literal>.</simpara>
+<simpara>When using direct mode, you can specify additional arguments which
+should be passed to the underlying tool. If the argument
+<literal>--</literal> is given on the command-line, then subsequent arguments are sent
+directly to the underlying tool. For example, the following adjusts
+the character set used by <literal>mysqldump</literal>:</simpara>
+<screen>$ sqoop import --connect jdbc:mysql://server.foo.com/db --table bar \
+    --direct -- --default-character-set=latin1</screen>
+<simpara>By default, imports go to a new target location. If the destination directory
+already exists in HDFS, Sqoop will refuse to import and overwrite that
+directory&#8217;s contents. If you use the <literal>--append</literal> argument, Sqoop will import
+data to a temporary directory and then rename the files into the normal
+target directory in a manner that does not conflict with existing filenames
+in that directory.</simpara>
+</section>
+<section id="_controlling_transaction_isolation">
+<title>Controlling transaction isolation</title>
+<simpara>By default, Sqoop uses the read committed transaction isolation in the mappers
+to import data.  This may not be the ideal in all ETL workflows and it may
+desired to reduce the isolation guarantees.   The <literal>--relaxed-isolation</literal> option
+can be used to instruct Sqoop to use read uncommitted isolation level.</simpara>
+<simpara>The <literal>read-uncommitted</literal> isolation level is not supported on all databases
+(for example, Oracle), so specifying the option <literal>--relaxed-isolation</literal>
+may not be supported on all databases.</simpara>
+</section>
+<section id="_controlling_type_mapping">
+<title>Controlling type mapping</title>
+<simpara>Sqoop is preconfigured to map most SQL types to appropriate Java or Hive
+representatives. However the default mapping might not be suitable for
+everyone and might be overridden by <literal>--map-column-java</literal> (for changing
+mapping to Java) or <literal>--map-column-hive</literal> (for changing Hive mapping).</simpara>
+<table pgwide="0"
+frame="topbot"
+rowsep="1" colsep="1"
+>
+<title>Parameters for overriding mapping</title>
+<tgroup cols="2">
+<colspec colwidth="206*" align="left"/>
+<colspec colwidth="236*" align="left"/>
+<thead>
+  <row>
+    <entry>
+    Argument
+    </entry>
+    <entry>
+    Description
+    </entry>
+  </row>
+</thead>
+<tbody>
+  <row>
+    <entry>
+    <literal>--map-column-java &lt;mapping&gt;</literal>
+    </entry>
+    <entry>
+    Override mapping from SQL to Java type                                  for configured columns.
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--map-column-hive &lt;mapping&gt;</literal>
+    </entry>
+    <entry>
+    Override mapping from SQL to Hive type                                  for configured columns.
+    </entry>
+  </row>
+</tbody>
+</tgroup>
+</table>
+<simpara>Sqoop is expecting comma separated list of mapping in form &lt;name of column&gt;=&lt;new type&gt;. For example:</simpara>
+<screen>$ sqoop import ... --map-column-java id=String,value=Integer</screen>
+<simpara>Sqoop will rise exception in case that some configured mapping will not be used.</simpara>
+</section>
+<section id="_incremental_imports">
+<title>Incremental Imports</title>
+<simpara>Sqoop provides an incremental import mode which can be used to retrieve
+only rows newer than some previously-imported set of rows.</simpara>
+<simpara>The following arguments control incremental imports:</simpara>
+<table pgwide="0"
+frame="topbot"
+rowsep="1" colsep="1"
+>
+<title>Incremental import arguments:</title>
+<tgroup cols="2">
+<colspec colwidth="182*" align="left"/>
+<colspec colwidth="236*" align="left"/>
+<thead>
+  <row>
+    <entry>
+    Argument
+    </entry>
+    <entry>
+    Description
+    </entry>
+  </row>
+</thead>
+<tbody>
+  <row>
+    <entry>
+    <literal>--check-column (col)</literal>
+    </entry>
+    <entry>
+    Specifies the column to be examined                               when determining which rows to import.                              (the column should not be of type                               CHAR/NCHAR/VARCHAR/VARNCHAR/                              LONGVARCHAR/LONGNVARCHAR)
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--incremental (mode)</literal>
+    </entry>
+    <entry>
+    Specifies how Sqoop determines which                               rows are new. Legal values for <literal>mode</literal>                              include <literal>append</literal> and <literal>lastmodified</literal>.
+    </entry>
+  </row>
+  <row>
+    <entry>
+    <literal>--last-value (value)</literal>
+    </entry>
+    <entry>
+    Specifies the maximum value of the                               check column from the previous import.
+    </entry>
+  </row>
+</tbody>
+</tgroup>
+</table>
+<simpara>Sqoop supports two types of incremental imports: <literal>append</literal> and <literal>lastmodified</literal>.
+You can use the <literal>--incremental</literal> argument to specify the type of incremental
+import to perform.</simpara>
+<simpara>You should specify <literal>append</literal> mode when importing a table where new rows are
+continually being added with increasing row id values. You specify the column
+containing the row&#8217;s id with <literal>--check-column</literal>. Sqoop imports rows where the
+check column has a value greater than the one specified with <literal>--last-value</literal>.</simpara>
+<simpara>An alternate table update strategy supported by Sqoop is called <literal>lastmodified</literal>
+mode. You should use this when rows of the source table may be updated, and
+each such update will set the value of a last-modified column to the current
+timestamp.  Rows where the check column holds a timestamp more recent than the
+timestamp specified with <literal>--last-value</literal> are imported.</simpara>
+<simpara>At the end of an incremental import, the value which should be specified as
+<literal>--last-value</literal> for a subsequent import is printed to the screen. When running
+a subsequent import, you should specify <literal>--last-value</literal> in this way to ensure
+you import only the new or updated data. This is handled automatically by
+creating an incremental import as a saved job, which is the preferred
+mechanism for performing a recurring incremental import. See the section on
+saved jobs later in this document for more information.</simpara>
+</section>
+<section id="_file_formats">
+<title>File Formats</title>
+<simpara>You can import data in one of two file formats: delimited text or
+SequenceFiles.</simpara>
+<simpara>Delimited text is the default import format. You can also specify it
+explicitly by using the <literal>--as-textfile</literal> argument. This argument will write
+string-based representations of each record to the output files, with
+delimiter characters between individual columns and rows. These
+delimiters may be commas, tabs, or other characters. (The delimiters
+can be selected; see "Output line formatting arguments.") The
+following is the results of an example text-based import:</simpara>
+<screen>1,here is a message,2010-05-01
+2,happy new year!,2010-01-01
+3,another message,2009-11-12</screen>
+<simpara>Delimited text is appropriate for most non-binary data types. It also
+readily supports further manipulation by other tools, such as Hive.</simpara>

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/docs/user/import-all-tables-purpose.txt
----------------------------------------------------------------------
diff --git a/src/docs/user/import-all-tables-purpose.txt b/src/docs/user/import-all-tables-purpose.txt
index d5df88c..d1d39a4 100644
--- a/src/docs/user/import-all-tables-purpose.txt
+++ b/src/docs/user/import-all-tables-purpose.txt
@@ -7,9 +7,9 @@
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at
- 
+
       http://www.apache.org/licenses/LICENSE-2.0
- 
+
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -23,7 +23,7 @@ Data from each table is stored in a separate directory in HDFS.
 For the +import-all-tables+ tool to be useful, the following conditions
 must be met:
 
-- Each table must have a single-column primary key.
+- Each table must have a single-column primary key or +--autoreset-to-one-mapper+ option must be used.
 - You must intend to import all columns of each table.
 - You must not intend to use non-default splitting column, nor impose
   any conditions via a +WHERE+ clause.

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/docs/user/import-all-tables.txt
----------------------------------------------------------------------
diff --git a/src/docs/user/import-all-tables.txt b/src/docs/user/import-all-tables.txt
index 166825a..fbad47b 100644
--- a/src/docs/user/import-all-tables.txt
+++ b/src/docs/user/import-all-tables.txt
@@ -6,9 +6,9 @@
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at
- 
+
       http://www.apache.org/licenses/LICENSE-2.0
- 
+
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -56,6 +56,8 @@ Argument                     Description
 +\--compression-codec <c>+   Use Hadoop codec (default gzip)
 +\--exclude-tables <tables>+ Comma separated list of tables to exclude\
                              from import process
++\--autoreset-to-one-mapper+ Import should use one mapper if a table\
+                             with no primary key is encountered
 ---------------------------------------------------------------------
 
 These arguments behave in the same manner as they do when used for the

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/docs/user/import.txt
----------------------------------------------------------------------
diff --git a/src/docs/user/import.txt b/src/docs/user/import.txt
index a6b23df..df04157 100644
--- a/src/docs/user/import.txt
+++ b/src/docs/user/import.txt
@@ -71,7 +71,12 @@ Argument                          Description
 +-m,\--num-mappers <n>+           Use 'n' map tasks to import in parallel
 +-e,\--query <statement>+         Import the results of '+statement+'.
 +\--split-by <column-name>+       Column of the table used to split work\
-                                  units
+                                  units.  Cannot be used with\
+                                  +--autoreset-to-one-mapper+ option.
++\--autoreset-to-one-mapper+      Import should use one mapper if a table\
+                                  has no primary key and no split-by column\
+                                  is provided.  Cannot be used with\
+                                  +--split-by <col>+ option.
 +\--table <table-name>+           Table to read
 +\--target-dir <dir>+             HDFS destination dir
 +\--warehouse-dir <dir>+          HDFS parent for table destination
@@ -206,6 +211,13 @@ multi-column indices. If your table has no index column, or has a
 multi-column key, then you must also manually choose a splitting
 column.
 
+If a table does not have a primary key defined and the +--split-by <col>+
+is not provided, then import will fail unless the number
+of mappers is explicitly set to one with the +--num-mappers 1+ option
+or the +--autoreset-to-one-mapper+ option is used.  The option
++--autoreset-to-one-mapper+ is typically used with the import-all-tables
+tool to automatically handle tables without a primary key in a schema.
+
 include::distributed-cache.txt[]
 
 Controlling the Import Process

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/java/org/apache/sqoop/SqoopOptions.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/SqoopOptions.java b/src/java/org/apache/sqoop/SqoopOptions.java
index 0070d0b..fc3d847 100644
--- a/src/java/org/apache/sqoop/SqoopOptions.java
+++ b/src/java/org/apache/sqoop/SqoopOptions.java
@@ -311,6 +311,10 @@ public class SqoopOptions implements Cloneable {
   // Relaxed Isolation
   @StoredAsProperty("relaxed.isolation") private boolean relaxedIsolation;
 
+  // Use single mapper for non-primary key tables without
+  // explicit split by cols
+  @StoredAsProperty("reset.onemapper") private boolean autoResetToOneMapper;
+
   // These next two fields are not serialized to the metastore.
   // If this SqoopOptions is created by reading a saved job, these will
   // be populated by the JobStorage to facilitate updating the same
@@ -2496,10 +2500,19 @@ public class SqoopOptions implements Cloneable {
     return this.relaxedIsolation;
   }
 
+  public void setAutoResetToOneMapper(boolean b) {
+	 this.autoResetToOneMapper = b;
+  }
+
+  public boolean getAutoResetToOneMapper() {
+	return this.autoResetToOneMapper;
+  }
+
   public String getHCatalogPartitionKeys() {
     return hCatalogPartitionKeys;
   }
 
+
   public void setHCatalogPartitionKeys(String hpks) {
     this.hCatalogPartitionKeys = hpks;
   }

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/java/org/apache/sqoop/accumulo/AccumuloUtil.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/accumulo/AccumuloUtil.java b/src/java/org/apache/sqoop/accumulo/AccumuloUtil.java
index 06888c5..e6d605c 100644
--- a/src/java/org/apache/sqoop/accumulo/AccumuloUtil.java
+++ b/src/java/org/apache/sqoop/accumulo/AccumuloUtil.java
@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.Set;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -30,6 +31,7 @@ import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.sqoop.SqoopOptions;
 import org.apache.sqoop.config.ConfigurationConstants;
+import org.apache.sqoop.config.ConfigurationHelper;
 
 /**
  * This class provides a method that checks if Accumulo jars are present in the
@@ -73,7 +75,7 @@ public final class AccumuloUtil {
    */
   public static void addJars(Job job, SqoopOptions options) throws IOException {
 
-    if (isLocalJobTracker(job)) {
+    if (ConfigurationHelper.isLocalJobTracker(job.getConfiguration())) {
       LOG.info("Not adding Accumulo jars to distributed cache in local mode");
     } else if (options.isSkipDistCache()) {
       LOG.info("Not adding Accumulo jars to distributed cache as requested");
@@ -168,24 +170,4 @@ public final class AccumuloUtil {
     }
   }
 
-  /**
-   * Check to see if the job is running in local mode.
-   */
-  public static boolean isLocalJobTracker(Job job) {
-    boolean retval = false;
-    Configuration conf = job.getConfiguration();
-    // If framework is set to YARN, then we can't be running in local mode
-    if ("yarn".equalsIgnoreCase(conf
-      .get(ConfigurationConstants.PROP_MAPREDUCE_FRAMEWORK_NAME))) {
-      retval = false;
-    } else {
-      String jtAddr = conf
-        .get(ConfigurationConstants.PROP_MAPRED_JOB_TRACKER_ADDRESS);
-      String jtAddr2 = conf
-        .get(ConfigurationConstants.PROP_MAPREDUCE_JOB_TRACKER_ADDRESS);
-      retval = (jtAddr != null && jtAddr.equals("local"))
-         || (jtAddr2 != null && jtAddr2.equals("local"));
-    }
-    return retval;
-  }
 }

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/java/org/apache/sqoop/config/ConfigurationHelper.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/config/ConfigurationHelper.java b/src/java/org/apache/sqoop/config/ConfigurationHelper.java
index 5e4230e..8dc2061 100644
--- a/src/java/org/apache/sqoop/config/ConfigurationHelper.java
+++ b/src/java/org/apache/sqoop/config/ConfigurationHelper.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.hadoop.util.GenericOptionsParser;
 
 import com.cloudera.sqoop.mapreduce.db.DBConfiguration;
+
 import org.apache.hadoop.util.ReflectionUtils;
 
 /**
@@ -39,19 +40,34 @@ import org.apache.hadoop.util.ReflectionUtils;
 public final class ConfigurationHelper {
 
   /**
+   * We track the number of maps in local mode separately as
+   * mapred.map.tasks or mapreduce.job.maps is ignored in local mode and will
+   * always return 1 irrespective of what we set the value to in the
+   * configuration.
+   */
+  public static int numLocalModeMaps = 1;
+  /**
    * Set the (hinted) number of map tasks for a job.
    */
   public static void setJobNumMaps(Job job, int numMapTasks) {
-    job.getConfiguration().setInt(
+    if (isLocalJobTracker(job.getConfiguration())) {
+      numLocalModeMaps = numMapTasks;
+    } else {
+      job.getConfiguration().setInt(
         ConfigurationConstants.PROP_MAPRED_MAP_TASKS, numMapTasks);
+    }
   }
 
   /**
    * Get the (hinted) number of map tasks for a job.
    */
   public static int getJobNumMaps(JobContext job) {
-    return job.getConfiguration().getInt(
+    if (isLocalJobTracker(job.getConfiguration())) {
+      return numLocalModeMaps;
+    } else {
+      return job.getConfiguration().getInt(
         ConfigurationConstants.PROP_MAPRED_MAP_TASKS, 1);
+    }
   }
 
   /**
@@ -78,7 +94,11 @@ public final class ConfigurationHelper {
    * Get the (hinted) number of map tasks for a job.
    */
   public static int getConfNumMaps(Configuration conf) {
-    return conf.getInt(ConfigurationConstants.PROP_MAPRED_MAP_TASKS, 1);
+    if (isLocalJobTracker(conf)) {
+      return numLocalModeMaps;
+    } else {
+      return conf.getInt(ConfigurationConstants.PROP_MAPRED_MAP_TASKS, 1);
+    }
   }
 
   /**
@@ -193,6 +213,20 @@ public final class ConfigurationHelper {
     return ret;
   }
 
+  public static boolean isLocalJobTracker(Configuration conf) {
+    // If framework is set to YARN, then we can't be running in local mode
+    if ("yarn".equalsIgnoreCase(conf
+      .get(ConfigurationConstants.PROP_MAPREDUCE_FRAMEWORK_NAME))) {
+      return false;
+    }
+    String jtAddr = conf
+      .get(ConfigurationConstants.PROP_MAPRED_JOB_TRACKER_ADDRESS);
+    String jtAddr2 = conf
+      .get(ConfigurationConstants.PROP_MAPREDUCE_JOB_TRACKER_ADDRESS);
+    return (jtAddr != null && jtAddr.equals("local"))
+      || (jtAddr2 != null && jtAddr2.equals("local"));
+  }
+
   private ConfigurationHelper() {
     // Disable explicit object creation
   }

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/java/org/apache/sqoop/manager/SqlManager.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/manager/SqlManager.java b/src/java/org/apache/sqoop/manager/SqlManager.java
index c81f170..ead581d 100644
--- a/src/java/org/apache/sqoop/manager/SqlManager.java
+++ b/src/java/org/apache/sqoop/manager/SqlManager.java
@@ -276,7 +276,7 @@ public abstract class SqlManager
     }
 
     try {
-      Map<String, List<Integer>> colInfo = 
+      Map<String, List<Integer>> colInfo =
           new SqlTypeMap<String, List<Integer>>();
 
       int cols = results.getMetaData().getColumnCount();
@@ -618,10 +618,15 @@ public abstract class SqlManager
     // correctly.
     String splitCol = getSplitColumn(opts, tableName);
     if (null == splitCol && opts.getNumMappers() > 1) {
-      // Can't infer a primary key.
-      throw new ImportException("No primary key could be found for table "
+      if (!opts.getAutoResetToOneMapper()) {
+        // Can't infer a primary key.
+        throw new ImportException("No primary key could be found for table "
           + tableName + ". Please specify one with --split-by or perform "
           + "a sequential import with '-m 1'.");
+      } else {
+        LOG.warn("Split by column not provided or can't be inferred.  Resetting to one mapper");
+        opts.setNumMappers(1);
+      }
     }
   }
 

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/java/org/apache/sqoop/mapreduce/hcat/SqoopHCatUtilities.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/mapreduce/hcat/SqoopHCatUtilities.java b/src/java/org/apache/sqoop/mapreduce/hcat/SqoopHCatUtilities.java
index ee25d0e..4070c24 100644
--- a/src/java/org/apache/sqoop/mapreduce/hcat/SqoopHCatUtilities.java
+++ b/src/java/org/apache/sqoop/mapreduce/hcat/SqoopHCatUtilities.java
@@ -63,6 +63,7 @@ import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
 import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
 import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;
 import org.apache.sqoop.config.ConfigurationConstants;
+import org.apache.sqoop.config.ConfigurationHelper;
 import org.apache.sqoop.hive.HiveTypes;
 import org.apache.sqoop.manager.ConnManager;
 import org.apache.sqoop.util.Executor;
@@ -857,7 +858,7 @@ public final class SqoopHCatUtilities {
    */
   public static void addJars(Job job, SqoopOptions options) throws IOException {
 
-    if (isLocalJobTracker(job)) {
+    if (ConfigurationHelper.isLocalJobTracker(job.getConfiguration())) {
       LOG.info("Not adding hcatalog jars to distributed cache in local mode");
       return;
     }
@@ -1033,23 +1034,9 @@ public final class SqoopHCatUtilities {
    }
  }
 
-  public static boolean isLocalJobTracker(Job job) {
-    Configuration conf = job.getConfiguration();
-    // If framework is set to YARN, then we can't be running in local mode
-    if ("yarn".equalsIgnoreCase(conf
-      .get(ConfigurationConstants.PROP_MAPREDUCE_FRAMEWORK_NAME))) {
-      return false;
-    }
-    String jtAddr = conf
-      .get(ConfigurationConstants.PROP_MAPRED_JOB_TRACKER_ADDRESS);
-    String jtAddr2 = conf
-      .get(ConfigurationConstants.PROP_MAPREDUCE_JOB_TRACKER_ADDRESS);
-    return (jtAddr != null && jtAddr.equals("local"))
-      || (jtAddr2 != null && jtAddr2.equals("local"));
-  }
 
   public void invokeOutputCommitterForLocalMode(Job job) throws IOException {
-    if (isLocalJobTracker(job) && isHadoop1()) {
+    if (ConfigurationHelper.isLocalJobTracker(job.getConfiguration()) && isHadoop1()) {
       // HCatalog 0.11- do have special class HCatHadoopShims, however this
       // class got merged into Hive Shim layer in 0.12+. Following method will
       // try to find correct implementation via reflection.

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/java/org/apache/sqoop/tool/BaseSqoopTool.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/tool/BaseSqoopTool.java b/src/java/org/apache/sqoop/tool/BaseSqoopTool.java
index b41ee2d..058e81d 100644
--- a/src/java/org/apache/sqoop/tool/BaseSqoopTool.java
+++ b/src/java/org/apache/sqoop/tool/BaseSqoopTool.java
@@ -220,6 +220,12 @@ public abstract class BaseSqoopTool extends com.cloudera.sqoop.tool.SqoopTool {
   public static final String OLD_DATASET_ARG = "onto";
   public static final String MERGE_KEY_ARG = "merge-key";
 
+  // Reset number of mappers to one if there is no primary key avaliable and
+  // split by column is explicitly not provided
+
+  public static final String AUTORESET_TO_ONE_MAPPER = "autoreset-to-one-mapper";
+
+
   public BaseSqoopTool() {
   }
 

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/java/org/apache/sqoop/tool/ImportAllTablesTool.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/tool/ImportAllTablesTool.java b/src/java/org/apache/sqoop/tool/ImportAllTablesTool.java
index ab95959..0952c11 100644
--- a/src/java/org/apache/sqoop/tool/ImportAllTablesTool.java
+++ b/src/java/org/apache/sqoop/tool/ImportAllTablesTool.java
@@ -98,10 +98,16 @@ public class ImportAllTablesTool extends com.cloudera.sqoop.tool.ImportTool {
         LOG.error("manager.listTables() returned null");
         return 1;
       } else {
+        int numMappers = options.getNumMappers();
         for (String tableName : tables) {
           if (excludes.contains(tableName)) {
             System.out.println("Skipping table: " + tableName);
           } else {
+            /*
+             * Number of mappers could be potentially reset in imports.  So
+             * we set it to the configured number before each import.
+             */
+            options.setNumMappers(numMappers);
             importTable(options, tableName, hiveImport);
           }
         }

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/java/org/apache/sqoop/tool/ImportTool.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/sqoop/tool/ImportTool.java b/src/java/org/apache/sqoop/tool/ImportTool.java
index d5bf1eb..c79e044 100644
--- a/src/java/org/apache/sqoop/tool/ImportTool.java
+++ b/src/java/org/apache/sqoop/tool/ImportTool.java
@@ -750,7 +750,10 @@ public class ImportTool extends com.cloudera.sqoop.tool.BaseSqoopTool {
         + "database when more rows are needed")
         .withLongOpt(FETCH_SIZE_ARG)
         .create());
-
+    importOpts.addOption(OptionBuilder.withArgName("reset-mappers")
+      .withDescription("Reset the number of mappers to one mapper if no split key available")
+      .withLongOpt(AUTORESET_TO_ONE_MAPPER)
+      .create());
     return importOpts;
   }
 
@@ -969,6 +972,10 @@ public class ImportTool extends com.cloudera.sqoop.tool.BaseSqoopTool {
         out.setExistingJarName(in.getOptionValue(JAR_FILE_NAME_ARG));
       }
 
+      if (in.hasOption(AUTORESET_TO_ONE_MAPPER)) {
+        out.setAutoResetToOneMapper(true);
+      }
+
       applyIncrementalOptions(in, out);
       applyHiveOptions(in, out);
       applyOutputFormatOptions(in, out);
@@ -1066,6 +1073,10 @@ public class ImportTool extends com.cloudera.sqoop.tool.BaseSqoopTool {
          != SqoopOptions.IncrementalMode.None) {
        throw new InvalidOptionsException("--delete-target-dir can not be used"
          + " with incremental imports.");
+    } else if (options.getAutoResetToOneMapper()
+        && (options.getSplitByCol() != null)) {
+        throw new InvalidOptionsException("--autoreset-to-one-mapper and"
+          + " --split-by cannot be used together.");
     }
   }
 

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/test/com/cloudera/sqoop/TestAvroImport.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/TestAvroImport.java b/src/test/com/cloudera/sqoop/TestAvroImport.java
index 440e76c..dd051f3 100644
--- a/src/test/com/cloudera/sqoop/TestAvroImport.java
+++ b/src/test/com/cloudera/sqoop/TestAvroImport.java
@@ -66,7 +66,8 @@ public class TestAvroImport extends ImportJobTestCase {
     if (includeHadoopFlags) {
       CommonArgs.addHadoopFlags(args);
     }
-
+    args.add("-m");
+    args.add("1");
     args.add("--table");
     args.add(getTableName());
     args.add("--connect");

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/test/com/cloudera/sqoop/TestBoundaryQuery.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/TestBoundaryQuery.java b/src/test/com/cloudera/sqoop/TestBoundaryQuery.java
index dc82de2..99f652e 100644
--- a/src/test/com/cloudera/sqoop/TestBoundaryQuery.java
+++ b/src/test/com/cloudera/sqoop/TestBoundaryQuery.java
@@ -170,7 +170,7 @@ public class TestBoundaryQuery extends ImportJobTestCase {
       + getTableName() +" where intfield1 in (3, 5)";
 
     runQueryTest(query, true, 2, 8, getTablePath().toString(),
-        "--split-by", "INTFIELD1");
+      "--m", "1", "--split-by", "INTFIELD1");
   }
 
   public void testNoBoundaryQuerySingleMapper() throws IOException {

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/test/com/cloudera/sqoop/TestParquetImport.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/TestParquetImport.java b/src/test/com/cloudera/sqoop/TestParquetImport.java
index 192da17..21a2fec 100644
--- a/src/test/com/cloudera/sqoop/TestParquetImport.java
+++ b/src/test/com/cloudera/sqoop/TestParquetImport.java
@@ -66,6 +66,8 @@ public class TestParquetImport extends ImportJobTestCase {
     args.add(HsqldbTestServer.getUrl());
     args.add("--warehouse-dir");
     args.add(getWarehouseDir());
+    args.add("--m");
+    args.add("1");
     args.add("--split-by");
     args.add("INTFIELD1");
     args.add("--as-parquetfile");

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/test/com/cloudera/sqoop/TestSqoopOptions.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/TestSqoopOptions.java b/src/test/com/cloudera/sqoop/TestSqoopOptions.java
index 60460d9..4ae583d 100644
--- a/src/test/com/cloudera/sqoop/TestSqoopOptions.java
+++ b/src/test/com/cloudera/sqoop/TestSqoopOptions.java
@@ -475,4 +475,25 @@ public class TestSqoopOptions extends TestCase {
     };
     validateImportOptions(extraArgs);
   }
+
+  public void testResetToOneMapper() throws Exception {
+    String extraArgs[] = {
+      "--autoreset-to-one-mapper",
+    };
+    validateImportOptions(extraArgs);
+  }
+
+  public void testResetToOneMapperAndSplitBy() throws Exception {
+    String extraArgs[] = {
+      "--autoreset-to-one-mapper",
+      "--split-by",
+      "col0",
+    };
+    try {
+      validateImportOptions(extraArgs);
+      fail("Expected InvalidOptionsException");
+    } catch (SqoopOptions.InvalidOptionsException ioe) {
+      // Expected
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/test/com/cloudera/sqoop/testutil/ImportJobTestCase.java
----------------------------------------------------------------------
diff --git a/src/test/com/cloudera/sqoop/testutil/ImportJobTestCase.java b/src/test/com/cloudera/sqoop/testutil/ImportJobTestCase.java
index d5b7f8f..dbd9061 100644
--- a/src/test/com/cloudera/sqoop/testutil/ImportJobTestCase.java
+++ b/src/test/com/cloudera/sqoop/testutil/ImportJobTestCase.java
@@ -94,7 +94,7 @@ public abstract class ImportJobTestCase extends BaseSqoopTestCase {
     args.add(getConnectString());
     args.add("--as-sequencefile");
     args.add("--num-mappers");
-    args.add("2");
+    args.add("1");
 
     args.addAll(getExtraArgs(conf));
 

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/test/org/apache/sqoop/TestAutoResetMapper.java
----------------------------------------------------------------------
diff --git a/src/test/org/apache/sqoop/TestAutoResetMapper.java b/src/test/org/apache/sqoop/TestAutoResetMapper.java
new file mode 100644
index 0000000..4c8282c
--- /dev/null
+++ b/src/test/org/apache/sqoop/TestAutoResetMapper.java
@@ -0,0 +1,191 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.sqoop;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.sqoop.tool.ImportAllTablesTool;
+import org.junit.After;
+import org.junit.Before;
+
+import com.cloudera.sqoop.testutil.ImportJobTestCase;
+
+public class TestAutoResetMapper extends ImportJobTestCase {
+
+  /** the names of the tables we're creating. */
+  private List<String> tableNames;
+
+  private String[][] expectedStrings;
+
+  public static final Log LOG =
+      LogFactory.getLog(TestAutoResetMapper.class.getName());
+
+  private static  String[][] types = {
+    { "INT NOT NULL", "VARCHAR(32)" },
+    { "INT NOT NULL PRIMARY KEY", "VARCHAR(32)" },
+    { "INT NOT NULL", "VARCHAR(32)" },
+    { "INT NOT NULL PRIMARY KEY", "VARCHAR(32)" },
+    { "INT NOT NULL", "VARCHAR(32)" },
+  };
+
+  private int[] expectedPartFiles = { 1, 2, 1, 2, 1};
+
+  /**
+   * Create the argv to pass to Sqoop.
+   * @return the argv as an array of strings.
+   */
+  private String [] getArgv() {
+    ArrayList<String> args = new ArrayList<String>();
+
+    args.add("-D");
+    args.add("mapreduce.jobtracker.address=local");
+    args.add("-D");
+    args.add("fs.defaultFS=file:///");
+    args.add("-D");
+    args.add("jobclient.completion.poll.interval=50");
+    args.add("-D");
+    args.add("jobclient.progress.monitor.poll.interval=50");
+
+    args.add("--warehouse-dir");
+    args.add(getWarehouseDir());
+    args.add("--num-mappers");
+    args.add("2");
+    args.add("--connect");
+    args.add(getConnectString());
+    args.add("--escaped-by");
+    args.add("\\");
+    args.add("--autoreset-to-one-mapper");
+    args.add("--verbose");
+
+    return args.toArray(new String[0]);
+  }
+
+  @Before
+  public void setUp() {
+    // start the server
+    super.setUp();
+
+    if (useHsqldbTestServer()) {
+      // throw away TWOINTTABLE and things we don't care about.
+      try {
+        this.getTestServer().dropExistingSchema();
+      } catch (SQLException sqlE) {
+        fail(sqlE.toString());
+      }
+    }
+
+    this.tableNames = new ArrayList<String>();
+
+    int numTables = types.length;
+
+    this.expectedStrings = new String[numTables][];
+
+    int numRows = 2;
+
+    for (int i = 0; i < numTables; ++i) {
+      expectedStrings[i] = new String[numRows];
+      List<String> vals = new ArrayList<String>();
+      for (int j = 0; j < numRows; ++j) {
+        String num = Integer.toString(j + 1);
+        String str = "Table " + Integer.toString(i + 1) + " Row " + num;
+        vals.add(num);
+        vals.add("'" + str + "'");
+        expectedStrings[i][j] = num + "," + str;
+      }
+      this.createTableWithColTypes(types[i], vals.toArray(new String[vals.size()]));
+      this.tableNames.add(this.getTableName());
+      this.removeTableDir();
+      incrementTableNum();
+    };
+
+  }
+
+  @After
+  public void tearDown() {
+    try {
+      for (String table : tableNames) {
+        dropTableIfExists(table);
+      }
+    } catch(SQLException e) {
+      LOG.error("Can't clean up the database:", e);
+    }
+    super.tearDown();
+  }
+
+  public void testMultiTableImportWithAutoMapperReset() throws IOException {
+
+    String[] argv = getArgv();
+    runImport(new ImportAllTablesTool(), argv);
+
+    Path warehousePath = new Path(this.getWarehouseDir());
+    FileSystem fs = FileSystem.get(getConf());
+    for (int i = 0; i < this.tableNames.size(); ++i) {
+      String tableName = this.tableNames.get(i);
+      LOG.debug("Validating import of " + tableName);
+      Path tablePath = new Path(warehousePath, tableName);
+      int numPartFiles = 0;
+      List<String> importedData = new ArrayList<String>();
+      // We expect utmost 2 files
+      for (int m = 0; m < 2 ; ++m) {
+        Path filePath = new Path(tablePath, "part-m-0000" + Integer.toString(m));
+        if (fs.exists(filePath)) {
+          ++numPartFiles;
+          LOG.debug("Reading imported file " + filePath);
+          BufferedReader reader = null;
+          if (!isOnPhysicalCluster()) {
+            reader = new BufferedReader(
+              new InputStreamReader(new FileInputStream(
+                new File(filePath.toString()))));
+          } else {
+            FileSystem dfs = FileSystem.get(getConf());
+            FSDataInputStream dis = dfs.open(filePath);
+            reader = new BufferedReader(new InputStreamReader(dis));
+          }
+          String line = null;
+          try {
+            while ((line = reader.readLine()) != null) {
+              importedData.add(line);
+            }
+          } finally {
+            IOUtils.closeStream(reader);
+          }
+        }
+      }
+      assertEquals("Table " + tableName + " expected a different number of part files",
+         expectedPartFiles[i], numPartFiles);
+      for (int k = 0; k < this.expectedStrings[i].length; ++k) {
+        assertEquals("Table " + tableName + "expected a different string",
+          expectedStrings[i][k], importedData.get(k));
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/sqoop/blob/2576ca2f/src/test/org/apache/sqoop/mapreduce/mainframe/TestMainframeDatasetInputFormat.java
----------------------------------------------------------------------
diff --git a/src/test/org/apache/sqoop/mapreduce/mainframe/TestMainframeDatasetInputFormat.java b/src/test/org/apache/sqoop/mapreduce/mainframe/TestMainframeDatasetInputFormat.java
index e386fb0..ba7d240 100644
--- a/src/test/org/apache/sqoop/mapreduce/mainframe/TestMainframeDatasetInputFormat.java
+++ b/src/test/org/apache/sqoop/mapreduce/mainframe/TestMainframeDatasetInputFormat.java
@@ -43,6 +43,8 @@ import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
+import com.cloudera.sqoop.config.ConfigurationHelper;
+
 public class TestMainframeDatasetInputFormat {
 
   private MainframeDatasetInputFormat<SqoopRecord> format;
@@ -97,7 +99,8 @@ public class TestMainframeDatasetInputFormat {
     String dsName = "dsName1";
     conf.set(MainframeConfiguration.MAINFRAME_INPUT_DATASET_NAME, dsName);
     Job job = new Job(conf);
-    format.getSplits(job);
+    ConfigurationHelper.setJobNumMaps(job, 2);
+    //format.getSplits(job);
 
     List<InputSplit> splits = new ArrayList<InputSplit>();
     splits = ((MainframeDatasetInputFormat<SqoopRecord>) format).getSplits(job);


Mime
View raw message