k-means and clustering text documents

Author: whenitsallover | Published 2018-04-04 14:26

    http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#sphx-glr-auto-examples-text-document-clustering-py

    Two feature extraction methods are demonstrated:
        TfidfVectorizer
        HashingVectorizer

    Two clustering algorithms are demonstrated: ordinary k-means and its more
    scalable cousin, MiniBatchKMeans. A short sketch contrasting the two
    vectorizers follows; the full example script comes after it.
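    A minimal standalone sketch (not part of the original script) of the practical
    difference between the two vectorizers: TfidfVectorizer learns a vocabulary, so
    feature indices can later be mapped back to terms (which is why the script only
    prints the top terms per cluster when hashing is disabled), while
    HashingVectorizer is stateless and needs no fit step.

        from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer

        docs = ["the space shuttle launch",
                "graphics rendering pipeline",
                "launch window for the shuttle"]

        tfidf = TfidfVectorizer()
        X_tfidf = tfidf.fit_transform(docs)            # learns a vocabulary during fit
        print(X_tfidf.shape, len(tfidf.vocabulary_))   # columns map back to terms

        hasher = HashingVectorizer(n_features=16)      # fixed width, no vocabulary kept
        X_hash = hasher.transform(docs)                # stateless: no fit() needed
        print(X_hash.shape)                            # (3, 16)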

    from __future__ import print_function
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer
    from sklearn import metrics
    from sklearn.cluster import KMeans, MiniBatchKMeans
    import logging
    from optparse import OptionParser
    import sys
    from time import time
    
    import numpy as np
    
    
    
    
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')  # configure logging
    
    # parse commandline arguments
    op = OptionParser()  # create an OptionParser instance
    op.add_option("--lsa",
                  dest="n_components", type="int",
                  help="Preprocess documents with latent semantic analysis.")
    op.add_option("--no-minibatch",
                  action="store_false", dest="minibatch", default=True,
                  help="Use ordinary k-means algorithm (in batch mode).")
    op.add_option("--no-idf",
                  action="store_false", dest="use_idf", default=True,
                  help="Disable Inverse Document Frequency feature weighting.")
    op.add_option("--use-hashing",
                  action="store_true", default=False,
                  help="Use a hashing feature vectorizer")
    op.add_option("--n-features", type=int, default=10000,
                  help="Maximum number of features (dimensions)"
                       " to extract from text.")
    op.add_option("--verbose",
                  action="store_true", dest="verbose", default=False,
                  help="Print progress reports inside k-means algorithm.")
    
    # each add_option() call above is inherited from OptionParser's parent class (add_option(*args, **kwargs))
    print(__doc__)
    op.print_help()
    
    
    def is_interactive():
        return not hasattr(sys.modules['__main__'], '__file__')
    
    # work-around for Jupyter notebook and IPython console
    argv = [] if is_interactive() else sys.argv[1:]
    (opts, args) = op.parse_args(argv)
    if len(args) > 0:
        op.error("this script takes no arguments.")
        sys.exit(1)
    
    
    # #############################################################################
    # Load some categories from the training set
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
    # Uncomment the following to do the analysis on all the categories
    # categories = None
    
    print("Loading 20 newsgroups dataset for categories:")
    print(categories)
    
    dataset = fetch_20newsgroups(subset='all', categories=categories,
                                 shuffle=True, random_state=42)
    # fetch_20newsgroups() Load the filenames and data from the 20 newsgroups dataset.
    
    print("%d documents" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    print()
    
    labels = dataset.target
    true_k = np.unique(labels).shape[0]
    #  Find the unique elements of an array.
    '''
    Returns the sorted unique elements of an array. There are three optional
    outputs in addition to the unique elements: the indices of the input array
    that give the unique values, the indices of the unique array that
    reconstruct the input array, and the number of times each unique value
    comes up in the input array.
    '''
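    # Illustration: np.unique([0, 2, 2, 1]) returns array([0, 1, 2]); true_k is
    # therefore the number of distinct newsgroup labels (4 for the categories above).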
    
    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        if opts.use_idf:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english', alternate_sign=False,
                                       norm=None, binary=False)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
            """Construct a Pipeline from the given estimators.
    
                This is a shorthand for the Pipeline constructor; it does not require, and
                does not permit, naming the estimators. Instead, their names will be set
                to the lowercase of their types automatically.
            """
    
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           alternate_sign=False, norm='l2',
                                           binary=False)
    else:
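        # max_df=0.5 drops terms that occur in more than half of the documents;
        # min_df=2 drops terms that occur in fewer than two documents.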
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(dataset.data)
    
    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    print()
    
    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        svd = TruncatedSVD(opts.n_components)
    
    
        normalizer = Normalizer(copy=False)
        '''
        
        Each sample (i.e. each row of the data matrix) with at least one
        non zero component is rescaled independently of other samples so
        that its norm (l1 or l2) equals one.
    
        This transformer is able to work both with dense numpy arrays and
        scipy.sparse matrix (use CSR format if you want to avoid the burden of
        a copy / conversion).
    
        Scaling inputs to unit norms is a common operation for text
        classification or clustering for instance. For instance the dot
        product of two l2-normalized TF-IDF vectors is the cosine similarity
        of the vectors and is the base similarity metric for the Vector
        Space Model commonly used by the Information Retrieval community.
    
        '''
        lsa = make_pipeline(svd, normalizer)
    
        X = lsa.fit_transform(X)
    
        print("done in %fs" % (time() - t0))
    
        explained_variance = svd.explained_variance_ratio_.sum()
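        # explained_variance_ratio_ holds the fraction of total variance captured
        # by each SVD component; its sum is the variance retained after reduction.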
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variance * 100)))
    
        print()
    
    
    # #############################################################################
    # Do the actual clustering
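    # MiniBatchKMeans updates the centroids on small random batches of samples
    # (batch_size) rather than on the full dataset at each iteration, trading a
    # little cluster quality for a large speed-up; init_size is the number of
    # samples drawn for the k-means++ initialization.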
    
    if opts.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000, verbose=opts.verbose)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                    verbose=opts.verbose)
    
    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()
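    # Homogeneity, completeness, V-measure (their harmonic mean) and the
    # adjusted Rand index all compare km.labels_ against the ground-truth
    # newsgroup labels; the silhouette coefficient is computed from X alone.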
    
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    
    print()
    
    
    if not opts.use_hashing:
        print("Top terms per cluster:")
    
        if opts.n_components:
            original_space_centroids = svd.inverse_transform(km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    
        terms = vectorizer.get_feature_names()
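        # Note: get_feature_names() was replaced by get_feature_names_out()
        # in scikit-learn >= 1.0.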
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()
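
    A possible follow-up, not part of the original example: once km has been
    fitted, new documents can be assigned to the learned clusters with
    km.predict(). The two documents below are made up; if the script was run
    with --lsa, the lsa pipeline would also have to be applied to the new
    vectors before prediction.

        new_docs = ["NASA plans another shuttle mission",
                    "3D rendering of textured surfaces"]
        X_new = vectorizer.transform(new_docs)   # same feature space as the training matrix
        print(km.predict(X_new))                 # cluster index for each new document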
    
