Design Interviews: Problems and Solutions: City Planning using k-means clustering algorithm

Problem

Given the coordinates of n houses in a city, find the best locations for k hospitals so that the mean distance residents must travel to reach their nearest hospital is minimized.

Input

n (0 < n <= 100)

(x, y) coordinates of n houses

k (0 < k <= 5)

Output

k coordinates representing the locations of the hospitals.

Solution

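This problem can be solved with the k-means clustering algorithm, which partitions the points of a scatter plot into k clusters so that the points in each cluster are as close as possible to that cluster's centroid. Note that k-means is a heuristic: it minimizes squared distances and converges to a local optimum, so the result is a good approximation rather than a guaranteed global optimum.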

Python Code

import math
import random
import matplotlib.pyplot as plt
import matplotlib.collections as mcoll
import time

def generate_random_points(n):
    """Return a list of ``n`` random ``(x, y)`` tuples.

    Each coordinate is drawn with ``random.uniform(0, 100)``. For every
    point the x value is drawn before the y value, so the sequence of
    draws from the random generator is x1, y1, x2, y2, ...
    """
    # Tuple elements are evaluated left to right, which keeps the
    # x-then-y draw order for each point.
    return [
        (random.uniform(0, 100), random.uniform(0, 100))
        for _ in range(n)
    ]

def calculate_mean_distance(points, centroids):
    """Return the average distance from each point to its nearest centroid.

    ``points`` and ``centroids`` are iterables of ``(x, y)`` pairs and
    distances are Euclidean. If ``centroids`` is empty every point
    contributes ``inf``. An empty ``points`` sequence raises
    ``ZeroDivisionError`` because the total is divided by ``len(points)``.
    """
    def distance_to_nearest(px, py):
        # Seed the candidates with infinity so the result is still
        # defined (as inf) when there are no centroids at all.
        candidates = [float('inf')]
        candidates.extend(
            math.sqrt((px - qx) ** 2 + (py - qy) ** 2)
            for qx, qy in centroids
        )
        return min(candidates)

    accumulated = 0
    for px, py in points:
        accumulated += distance_to_nearest(px, py)
    return accumulated / len(points)

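'''
This method starts with k centroids randomly chosen from the given coordinates.
It then repeatedly assigns every point to its nearest centroid and moves each
centroid to the mean of the points assigned to it, stopping once the centroids
no longer change.
'''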
def k_means(points, k):
    """Cluster ``points`` into ``k`` groups using k-means iteration.

    The initial centroids are ``k`` entries sampled from ``points``. Each
    pass assigns every point to its nearest centroid (Euclidean distance,
    ties going to the lower centroid index) and recomputes each centroid
    as the mean of its assigned points. Iteration stops when a pass leaves
    the centroids unchanged.

    After every pass that changes the centroids, the state is drawn with
    ``plot_iteration`` and execution pauses for one second.

    Args:
        points: Sequence of ``(x, y)`` tuples.
        k: Number of clusters.

    Returns:
        A ``(centroids, clusters)`` tuple: ``centroids`` is a list of ``k``
        ``(x, y)`` tuples and ``clusters[i]`` is the list of points assigned
        to ``centroids[i]``. A cluster may be empty.

    Raises:
        ValueError: From ``random.sample`` when ``k`` is negative or larger
            than ``len(points)``.
    """
    centroids = random.sample(points, k)
    iterations = 0
    while True:
        iterations += 1

        # Assignment step: put each point in the bucket of its nearest centroid.
        clusters = [[] for _ in range(k)]
        for x, y in points:
            min_distance = float('inf')
            closest_centroid = None
            for i, (cx, cy) in enumerate(centroids):
                distance = math.sqrt((x - cx) ** 2 + (y - cy) ** 2)
                if distance < min_distance:
                    min_distance = distance
                    closest_centroid = i
            clusters[closest_centroid].append((x, y))

        # Update step: move each centroid to the mean of its cluster.
        new_centroids = []
        for previous, cluster in zip(centroids, clusters):
            if not cluster:
                # A centroid can end up with no points (for example when two
                # sampled starting points share the same coordinates). There
                # is no mean to compute, so keep it where it is instead of
                # dividing by zero.
                new_centroids.append(previous)
                continue
            x_sum = sum(x for x, _ in cluster)
            y_sum = sum(y for _, y in cluster)
            new_centroids.append((x_sum / len(cluster), y_sum / len(cluster)))

        if new_centroids == centroids:
            break
        centroids = new_centroids
        plot_iteration(points, centroids, clusters, iterations)
        time.sleep(1)  # Pause for 1 second
    return centroids, clusters

def plot_iteration(points, centroids, clusters, iteration):
    """Redraw the current matplotlib figure for one k-means iteration.

    Draws all points as small black dots, the centroids as red stars, and
    then each non-empty cluster in its own color with a thin line from
    every member to ``centroids[i]``.

    Args:
        points: Sequence of ``(x, y)`` tuples for every point.
        centroids: Sequence of ``(x, y)`` centroid positions;
            ``centroids[i]`` belongs to ``clusters[i]``.
        clusters: Sequence of point lists, one per centroid. Empty
            clusters are skipped.
        iteration: Iteration number shown in the plot title.
    """
    plt.clf()  # Clear the previous plot
    colors = ['b', 'g', 'r', 'c', 'm']  # Colors for clusters

    # Plot the random points (zip(*...) cannot be unpacked when empty)
    if points:
        x_coords, y_coords = zip(*points)
        plt.scatter(x_coords, y_coords, c='k', marker='o', s=10, alpha=0.5, label='Random Points')

    # Plot the centroids
    if centroids:
        centroid_x, centroid_y = zip(*centroids)
        plt.scatter(centroid_x, centroid_y, c='r', marker='*', s=100, label='Centroids')

    # Plot the clusters and the line segments to their centroids
    axes = plt.gca()
    for i, cluster in enumerate(clusters):
        if not cluster:
            # Nothing to draw, and zip(*cluster) would fail to unpack.
            continue
        # Wrap around the palette so more than five clusters still plot.
        color = colors[i % len(colors)]
        x_coords, y_coords = zip(*cluster)
        plt.scatter(x_coords, y_coords, c=color, marker='o', label=f'Cluster {i+1}', alpha=0.5)
        centroid = (centroids[i][0], centroids[i][1])
        line_segments = [[(x, y), centroid] for x, y in cluster]
        line_collection = mcoll.LineCollection(line_segments, colors=color, linewidths=0.5, alpha=0.5)
        axes.add_collection(line_collection)

    plt.xlim(0, 100)
    plt.ylim(0, 100)
    plt.title(f'Iteration {iteration}')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.grid(True)
    plt.legend()
    plt.pause(0.01)  # Pause for a brief moment to update the plot

# Example usage: cluster random points and report the mean distance
n = 100  # Number of random points
k = 5    # Number of centroids to find
points = generate_random_points(n)
centroids, clusters = k_means(points, k)
mean_distance = calculate_mean_distance(points, centroids)
print(f"Mean distance of {k} centroids from {n} points: {mean_distance:.2f}")

# Draw the final result in a new figure: each cluster in its own color,
# its centroid as a black star, and a thin line from every member to
# that centroid.
plt.figure(figsize=(8, 6))
colors = ['b', 'g', 'r', 'c', 'm']  # Colors for clusters

for i, (cluster, centroid) in enumerate(zip(clusters, centroids)):
    color = colors[i]
    centroid_x, centroid_y = centroid
    x_coords, y_coords = zip(*cluster)
    plt.scatter(x_coords, y_coords, c=color, marker='o', label=f'Cluster {i+1}', alpha=0.5)
    plt.scatter(centroid_x, centroid_y, c='k', marker='*', s=100)
    line_segments = [
        [(x, y), (centroid_x, centroid_y)]
        for x, y in cluster
    ]
    line_collection = mcoll.LineCollection(line_segments, colors=color, linewidths=0.5, alpha=0.5)
    plt.gca().add_collection(line_collection)

plt.xlim(0, 100)
plt.ylim(0, 100)
plt.title('Random Points, Centroids, and Line Segments')
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(True)
plt.legend()
plt.show()

The above code requires the matplotlib library to be installed (for example, with `pip install matplotlib`).

Scatter Plot

The circles represent the coordinates of the houses, the stars represent the cluster centroids (the hospitals), and each line segment connects a house to the centroid of the cluster it was assigned to.

Design Interviews: Problems and Solutions

Pages

Sunday, April 21, 2024

City Planning using k-means clustering algorithm