K - the type of the input keys
V - the type of the input values
public class KMeansClustering<K,V> extends BaseJob<K,V>
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
ARG_CLUSTERS_POINTS |
static java.lang.String |
ARG_CRIT_FUN_CLASS |
static java.lang.String |
ARG_DELETE_CLUSTER_FILES |
static java.lang.String |
ARG_K |
static java.lang.String |
ARG_MAX_ITERATIONS |
static java.lang.String |
ARG_MAX_MEMBER_DISTANCE |
static java.lang.String |
ARG_SHAPE_GEN_CLASS |
protected double[] |
clustersPoints |
protected java.lang.Class<? extends CriterionFunction> |
critetionFunctionClass |
protected boolean |
deletePreviousClusterFiles |
protected int |
dimensions |
protected org.apache.hadoop.fs.Path |
inClustersPath |
protected int |
iteration |
protected int |
k |
protected int |
maxIterations |
protected double |
maxMemberDistance |
protected java.lang.Class<? extends ClusterShapeGenerator> |
shapeGeneratorClass |
protected org.apache.hadoop.fs.Path |
workDirPath |
Constructor and Description |
---|
KMeansClustering() |
Modifier and Type | Method and Description |
---|---|
void |
configure(org.apache.hadoop.mapred.JobConf conf)
Validates and adds the current parameters to the job configuration
|
protected void |
deletePreviousClustersFiles(org.apache.hadoop.conf.Configuration conf) |
protected void |
generateFinalResultClustersFile(ClusterInfo[] clusters, org.apache.hadoop.conf.Configuration conf) |
protected double[] |
generateInitialKClustersPoints(org.apache.hadoop.conf.Configuration conf) |
double[] |
getClustersPoints()
Gets the initial clusters points
|
java.lang.String |
getCmdOptions()
Gets a description of the arguments expected from command line.
|
java.lang.Class<? extends CriterionFunction> |
getCritetionFunctionClass()
Gets the CriterionFunction subclass to be used |
org.apache.hadoop.fs.Path |
getInClustersPath()
Gets the path where the input cluster information is located
|
int |
getIteration()
Gets the current iteration number
|
int |
getK()
Gets the number of clusters
|
int |
getMaxIterations()
Gets the maximum number of iterations allowed
|
protected PointWritable[] |
getSampledPoints(org.apache.hadoop.conf.Configuration conf) |
java.lang.Class<? extends ClusterShapeGenerator> |
getShapeGeneratorClass()
Gets the ClusterShapeGenerator subclass to be used |
org.apache.hadoop.fs.Path |
getWorkDirPath()
Gets the path where the mapreduce output is stored at each iteration
|
boolean |
isDeletePreviousClusterFiles()
Specifies whether the output generated by previous iterations should be removed or not
|
protected boolean |
isNextIteration(int iteration, int convClusters, int maxIterations, java.lang.Class<? extends ClusterShapeGenerator> finalShapeGenClass) |
static void |
main(java.lang.String[] args) |
protected static ClusterInfo[] |
pointsToClusterInfo(double[] centers, int dimensions) |
void |
processArgs(java.lang.String[] args, org.apache.hadoop.conf.Configuration conf)
Extracts and validates arguments from the command line
|
protected static void |
replaceEmptyOutClusters(ClusterInfo[] outClusters, ClusterInfo[] inClusters) |
int |
run(java.lang.String[] args) |
KMeans.KMeansIterationResult |
runIteration(int iteration, org.apache.hadoop.mapred.JobConf baseConf)
Launches a mapreduce job to run a single iteration
|
boolean |
runIterations(int maxIterations, org.apache.hadoop.mapred.JobConf baseConf)
Runs the K Means clustering algorithm using the given configuration
|
void |
setClustersPoints(double[] clustersPoints)
Sets the initial clusters points
|
void |
setCritetionFunctionClass(java.lang.Class<? extends CriterionFunction> critetionFunctionClass)
Sets the CriterionFunction subclass to be used |
void |
setDeletePreviousClusterFiles(boolean deletePreviousClusterFiles)
Specifies whether the output generated by previous iterations should be removed or not
|
void |
setInClustersPath(org.apache.hadoop.fs.Path inClustersPath)
Sets the path where the input cluster information is located
|
void |
setIteration(int iteration)
Sets the current iteration number
|
void |
setK(int k)
Sets the number of clusters
|
void |
setMaxIterations(int maxIterations)
Sets the maximum number of iterations allowed
|
void |
setOutput(java.lang.String output)
Sets the job output
|
void |
setShapeGeneratorClass(java.lang.Class<? extends ClusterShapeGenerator> shapeGeneratorClass)
Sets the ClusterShapeGenerator subclass to be used |
protected ClusterInfo[] |
setupNextIteractionClusters(KMeans.KMeansIterationResult iterationResult, ClusterInfo[] previousIterationClusters, org.apache.hadoop.conf.Configuration conf) |
configure, createJob, createJob, createJob, createJob, createJobConf, createJobConf, createJobConf, getInput, getInputFormatClass, getJarClass, getOutput, getRecordInfoProviderClass, getSpatialConfig, setInput, setInputFormatClass, setJarClass, setRecordInfoProviderClass, setSpatialConfig
protected int k
protected int dimensions
protected int iteration
protected double[] clustersPoints
protected org.apache.hadoop.fs.Path inClustersPath
protected boolean deletePreviousClusterFiles
protected int maxIterations
protected org.apache.hadoop.fs.Path workDirPath
protected java.lang.Class<? extends CriterionFunction> critetionFunctionClass
protected java.lang.Class<? extends ClusterShapeGenerator> shapeGeneratorClass
protected double maxMemberDistance
public static final java.lang.String ARG_MAX_MEMBER_DISTANCE
public static final java.lang.String ARG_SHAPE_GEN_CLASS
public static final java.lang.String ARG_CRIT_FUN_CLASS
public static final java.lang.String ARG_DELETE_CLUSTER_FILES
public static final java.lang.String ARG_CLUSTERS_POINTS
public static final java.lang.String ARG_MAX_ITERATIONS
public static final java.lang.String ARG_K
public org.apache.hadoop.fs.Path getWorkDirPath()
public void setOutput(java.lang.String output)
BaseJob
public void setClustersPoints(double[] clustersPoints)
clustersPoints - an array of point coordinates in the form x1,y1,x2,y2,...,xK,yK
public double[] getClustersPoints()
public void setInClustersPath(org.apache.hadoop.fs.Path inClustersPath)
inClustersPath - a path
public org.apache.hadoop.fs.Path getInClustersPath()
public void setK(int k)
k - the number of clusters
public int getK()
public void setIteration(int iteration)
iteration - the current iteration number
public int getIteration()
public void setDeletePreviousClusterFiles(boolean deletePreviousClusterFiles)
deletePreviousClusterFiles - true if intermediate output should be removed
public boolean isDeletePreviousClusterFiles()
public void setMaxIterations(int maxIterations)
maxIterations - the maximum number of iterations allowed
public int getMaxIterations()
public void setCritetionFunctionClass(java.lang.Class<? extends CriterionFunction> critetionFunctionClass)
Sets the CriterionFunction subclass to be used
critetionFunctionClass - a CriterionFunction subclass
public java.lang.Class<? extends CriterionFunction> getCritetionFunctionClass()
Gets the CriterionFunction subclass to be used
Returns: the CriterionFunction subclass
public void setShapeGeneratorClass(java.lang.Class<? extends ClusterShapeGenerator> shapeGeneratorClass)
Sets the ClusterShapeGenerator subclass to be used
shapeGeneratorClass - a ClusterShapeGenerator subclass
public java.lang.Class<? extends ClusterShapeGenerator> getShapeGeneratorClass()
Gets the ClusterShapeGenerator subclass to be used
Returns: the ClusterShapeGenerator subclass
public void processArgs(java.lang.String[] args, org.apache.hadoop.conf.Configuration conf) throws java.lang.Exception
BaseJob
processArgs
in class BaseJob<K,V>
args - arguments from the command line
conf - the job configuration
java.lang.Exception
public java.lang.String getCmdOptions()
BaseJob
getCmdOptions
in class BaseJob<K,V>
public void configure(org.apache.hadoop.mapred.JobConf conf) throws java.lang.Exception
BaseJob
public KMeans.KMeansIterationResult runIteration(int iteration, org.apache.hadoop.mapred.JobConf baseConf) throws java.lang.Exception
iteration - the iteration number
baseConf - the job configuration
Returns: a KMeans.KMeansIterationResult containing the results of the iteration
java.lang.Exception
public boolean runIterations(int maxIterations, org.apache.hadoop.mapred.JobConf baseConf) throws java.lang.Exception
maxIterations - the maximum number of iterations. If zero is passed, a default value based on the number of clusters will be used
baseConf - the job configuration
java.lang.Exception
protected PointWritable[] getSampledPoints(org.apache.hadoop.conf.Configuration conf) throws java.io.IOException, java.lang.InterruptedException
java.io.IOException
java.lang.InterruptedException
protected double[] generateInitialKClustersPoints(org.apache.hadoop.conf.Configuration conf) throws java.lang.Exception
java.lang.Exception
protected void generateFinalResultClustersFile(ClusterInfo[] clusters, org.apache.hadoop.conf.Configuration conf) throws java.io.IOException
java.io.IOException
protected void deletePreviousClustersFiles(org.apache.hadoop.conf.Configuration conf)
protected boolean isNextIteration(int iteration, int convClusters, int maxIterations, java.lang.Class<? extends ClusterShapeGenerator> finalShapeGenClass)
protected ClusterInfo[] setupNextIteractionClusters(KMeans.KMeansIterationResult iterationResult, ClusterInfo[] previousIterationClusters, org.apache.hadoop.conf.Configuration conf) throws java.io.IOException
java.io.IOException
protected static ClusterInfo[] pointsToClusterInfo(double[] centers, int dimensions)
protected static void replaceEmptyOutClusters(ClusterInfo[] outClusters, ClusterInfo[] inClusters)
public int run(java.lang.String[] args) throws java.lang.Exception
java.lang.Exception
public static void main(java.lang.String[] args) throws java.lang.Exception
java.lang.Exception