K - the type of the input keys
V - the type of the input values
public class KMeansClustering<K,V> extends BaseJob<K,V>
| Modifier and Type | Field and Description |
|---|---|
static java.lang.String |
ARG_CLUSTERS_POINTS |
static java.lang.String |
ARG_CRIT_FUN_CLASS |
static java.lang.String |
ARG_DELETE_CLUSTER_FILES |
static java.lang.String |
ARG_K |
static java.lang.String |
ARG_MAX_ITERATIONS |
static java.lang.String |
ARG_MAX_MEMBER_DISTANCE |
static java.lang.String |
ARG_SHAPE_GEN_CLASS |
protected double[] |
clustersPoints |
protected java.lang.Class<? extends CriterionFunction> |
critetionFunctionClass |
protected boolean |
deletePreviousClusterFiles |
protected int |
dimensions |
protected org.apache.hadoop.fs.Path |
inClustersPath |
protected int |
iteration |
protected int |
k |
protected int |
maxIterations |
protected double |
maxMemberDistance |
protected java.lang.Class<? extends ClusterShapeGenerator> |
shapeGeneratorClass |
protected org.apache.hadoop.fs.Path |
workDirPath |
| Constructor and Description |
|---|
KMeansClustering() |
| Modifier and Type | Method and Description |
|---|---|
void |
configure(org.apache.hadoop.mapred.JobConf conf)
Validates and adds the current parameters to the job configuration
|
protected void |
deletePreviousClustersFiles(org.apache.hadoop.conf.Configuration conf) |
protected void |
generateFinalResultClustersFile(ClusterInfo[] clusters, org.apache.hadoop.conf.Configuration conf) |
protected double[] |
generateInitialKClustersPoints(org.apache.hadoop.conf.Configuration conf) |
double[] |
getClustersPoints()
Gets the initial clusters points
|
java.lang.String |
getCmdOptions()
Gets a description of the arguments expected from command line.
|
java.lang.Class<? extends CriterionFunction> |
getCritetionFunctionClass()
Gets the CriterionFunction subclass to be used |
org.apache.hadoop.fs.Path |
getInClustersPath()
Gets the path where the input cluster information is located
|
int |
getIteration()
Gets the current iteration number
|
int |
getK()
Gets the number of clusters
|
int |
getMaxIterations()
Gets the maximum number of iterations allowed
|
protected PointWritable[] |
getSampledPoints(org.apache.hadoop.conf.Configuration conf) |
java.lang.Class<? extends ClusterShapeGenerator> |
getShapeGeneratorClass()
Gets the ClusterShapeGenerator subclass to be used |
org.apache.hadoop.fs.Path |
getWorkDirPath()
Gets the path where the mapreduce output is stored at each iteration
|
boolean |
isDeletePreviousClusterFiles()
Specifies whether the output generated by previous iterations should be removed or not
|
protected boolean |
isNextIteration(int iteration, int convClusters, int maxIterations, java.lang.Class<? extends ClusterShapeGenerator> finalShapeGenClass) |
static void |
main(java.lang.String[] args) |
protected static ClusterInfo[] |
pointsToClusterInfo(double[] centers, int dimensions) |
void |
processArgs(java.lang.String[] args, org.apache.hadoop.conf.Configuration conf)
Extracts and validates arguments from the command line
|
protected static void |
replaceEmptyOutClusters(ClusterInfo[] outClusters, ClusterInfo[] inClusters) |
int |
run(java.lang.String[] args) |
KMeans.KMeansIterationResult |
runIteration(int iteration, org.apache.hadoop.mapred.JobConf baseConf)
Launches a mapreduce job to run a single iteration
|
boolean |
runIterations(int maxIterations, org.apache.hadoop.mapred.JobConf baseConf)
Runs the K Means clustering algorithm using the given configuration
|
void |
setClustersPoints(double[] clustersPoints)
Sets the initial clusters points
|
void |
setCritetionFunctionClass(java.lang.Class<? extends CriterionFunction> critetionFunctionClass)
Sets the CriterionFunction subclass to be used |
void |
setDeletePreviousClusterFiles(boolean deletePreviousClusterFiles)
Specifies whether the output generated by previous iterations should be removed or not
|
void |
setInClustersPath(org.apache.hadoop.fs.Path inClustersPath)
Sets the path where the input cluster information is located
|
void |
setIteration(int iteration)
Sets the current iteration number
|
void |
setK(int k)
Sets the number of clusters
|
void |
setMaxIterations(int maxIterations)
Sets the maximum number of iterations allowed
|
void |
setOutput(java.lang.String output)
Sets the job output
|
void |
setShapeGeneratorClass(java.lang.Class<? extends ClusterShapeGenerator> shapeGeneratorClass)
Sets the ClusterShapeGenerator subclass to be used |
protected ClusterInfo[] |
setupNextIteractionClusters(KMeans.KMeansIterationResult iterationResult, ClusterInfo[] previousIterationClusters, org.apache.hadoop.conf.Configuration conf) |
configure, createJob, createJob, createJob, createJob, createJobConf, createJobConf, createJobConf, getInput, getInputFormatClass, getJarClass, getOutput, getRecordInfoProviderClass, getSpatialConfig, setInput, setInputFormatClass, setJarClass, setRecordInfoProviderClass, setSpatialConfig
protected int k
protected int dimensions
protected int iteration
protected double[] clustersPoints
protected org.apache.hadoop.fs.Path inClustersPath
protected boolean deletePreviousClusterFiles
protected int maxIterations
protected org.apache.hadoop.fs.Path workDirPath
protected java.lang.Class<? extends CriterionFunction> critetionFunctionClass
protected java.lang.Class<? extends ClusterShapeGenerator> shapeGeneratorClass
protected double maxMemberDistance
public static final java.lang.String ARG_MAX_MEMBER_DISTANCE
public static final java.lang.String ARG_SHAPE_GEN_CLASS
public static final java.lang.String ARG_CRIT_FUN_CLASS
public static final java.lang.String ARG_DELETE_CLUSTER_FILES
public static final java.lang.String ARG_CLUSTERS_POINTS
public static final java.lang.String ARG_MAX_ITERATIONS
public static final java.lang.String ARG_K
public org.apache.hadoop.fs.Path getWorkDirPath()
public void setOutput(java.lang.String output)
BaseJob
public void setClustersPoints(double[] clustersPoints)
clustersPoints - an array of point coordinates in the form x1,y1,x2,y2,...,xK,yK
public double[] getClustersPoints()
public void setInClustersPath(org.apache.hadoop.fs.Path inClustersPath)
inClustersPath - a path
public org.apache.hadoop.fs.Path getInClustersPath()
public void setK(int k)
k - the number of clusters
public int getK()
public void setIteration(int iteration)
iteration - the current iteration number
public int getIteration()
public void setDeletePreviousClusterFiles(boolean deletePreviousClusterFiles)
deletePreviousClusterFiles - true if intermediate output should be removed
public boolean isDeletePreviousClusterFiles()
public void setMaxIterations(int maxIterations)
maxIterations - the maximum number of iterations allowed
public int getMaxIterations()
public void setCritetionFunctionClass(java.lang.Class<? extends CriterionFunction> critetionFunctionClass)
Sets the CriterionFunction subclass to be used
critetionFunctionClass - a CriterionFunction subclass
public java.lang.Class<? extends CriterionFunction> getCritetionFunctionClass()
Gets the CriterionFunction subclass to be used
a CriterionFunction subclass
public void setShapeGeneratorClass(java.lang.Class<? extends ClusterShapeGenerator> shapeGeneratorClass)
Sets the ClusterShapeGenerator subclass to be used
shapeGeneratorClass - a ClusterShapeGenerator subclass
public java.lang.Class<? extends ClusterShapeGenerator> getShapeGeneratorClass()
Gets the ClusterShapeGenerator subclass to be used
a ClusterShapeGenerator subclass
public void processArgs(java.lang.String[] args,
org.apache.hadoop.conf.Configuration conf)
throws java.lang.Exception
BaseJob
processArgs in class BaseJob<K,V>
args - arguments from the command line
conf - the job configuration
java.lang.Exception
public java.lang.String getCmdOptions()
BaseJob
getCmdOptions in class BaseJob<K,V>
public void configure(org.apache.hadoop.mapred.JobConf conf)
throws java.lang.Exception
BaseJob
public KMeans.KMeansIterationResult runIteration(int iteration, org.apache.hadoop.mapred.JobConf baseConf) throws java.lang.Exception
iteration - the iteration number
baseConf - the job configuration
a KMeans.KMeansIterationResult containing the results of the iteration
java.lang.Exception
public boolean runIterations(int maxIterations,
org.apache.hadoop.mapred.JobConf baseConf)
throws java.lang.Exception
maxIterations - the maximum number of iterations. If zero is passed, a default value based on the number of clusters will be used
baseConf - the job configuration
java.lang.Exception
protected PointWritable[] getSampledPoints(org.apache.hadoop.conf.Configuration conf) throws java.io.IOException, java.lang.InterruptedException
java.io.IOException
java.lang.InterruptedException
protected double[] generateInitialKClustersPoints(org.apache.hadoop.conf.Configuration conf)
throws java.lang.Exception
java.lang.Exception
protected void generateFinalResultClustersFile(ClusterInfo[] clusters, org.apache.hadoop.conf.Configuration conf) throws java.io.IOException
java.io.IOException
protected void deletePreviousClustersFiles(org.apache.hadoop.conf.Configuration conf)
protected boolean isNextIteration(int iteration,
int convClusters,
int maxIterations,
java.lang.Class<? extends ClusterShapeGenerator> finalShapeGenClass)
protected ClusterInfo[] setupNextIteractionClusters(KMeans.KMeansIterationResult iterationResult, ClusterInfo[] previousIterationClusters, org.apache.hadoop.conf.Configuration conf) throws java.io.IOException
java.io.IOException
protected static ClusterInfo[] pointsToClusterInfo(double[] centers, int dimensions)
protected static void replaceEmptyOutClusters(ClusterInfo[] outClusters, ClusterInfo[] inClusters)
public int run(java.lang.String[] args)
throws java.lang.Exception
java.lang.Exception
public static void main(java.lang.String[] args)
throws java.lang.Exception
java.lang.Exception