remove_duplicates.py
# import the necessary packages
from imutils import paths
import numpy as np
import argparse
import cv2
import os
import pandas as pd


def dhash(image, hashSize=8):
    # convert the image to grayscale and resize the grayscale image,
    # adding a single column (width) so we can compute the horizontal
    # gradient
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (hashSize + 1, hashSize))
    # compute the (relative) horizontal gradient between adjacent
    # column pixels
    diff = resized[:, 1:] > resized[:, :-1]
    # convert the difference image to a hash and return it
    return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])
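

# A minimal sketch of how dhash packs the gradient bits into an integer (the
# array below is a made-up example, not part of the original pipeline):
#
#   diff = np.zeros((8, 8), dtype=bool)
#   diff[0, 0] = True          # first gradient bit set
#   diff[0, 2] = True          # third gradient bit set
#   sum(2 ** i for (i, v) in enumerate(diff.flatten()) if v)   # -> 1 + 4 = 5
#
# With hashSize=8 the flattened diff has 64 entries, so every image is reduced
# to a single 64-bit integer; exact duplicates produce identical integers.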
# grab the paths to all images in the input dataset directory, e.g. from a
# directory listing or from the cleaned tweet CSV:
# imagePaths = list(paths.list_images(args["dataset"]))
# cleaned_data = pd.read_csv('../tweets_scrape/csv_data/data_AsianHate_cleaned.csv', index_col=0)
# imagePaths = cleaned_data['image_path'].to_list()


def duplicate_detector(imagePaths):
    # initialize the hashes dictionary locally so repeated calls do not
    # accumulate results from previous datasets
    hashes = {}
    # loop over our image paths
    for imagePath in imagePaths:
        # print("{}/{} is being processed, {}".format(imagePaths.index(imagePath) +
        #     1, len(imagePaths), imagePath))
        # load the input image, skipping paths that cannot be read or decoded
        image = cv2.imread(imagePath)
        if image is None:
            continue
        try:
            image = cv2.resize(image, (150, 150))
        except cv2.error:
            continue
        # compute the hash for the resized image
        h = dhash(image)
        # grab all image paths with that hash, add the current image
        # path to it, and store the list back in the hashes dictionary
        p = hashes.get(h, [])
        p.append(imagePath)
        hashes[h] = p
    # keep only the first path seen for each hash, discarding the duplicates
    no_duplicate_paths = []
    for hashedPaths in hashes.values():
        no_duplicate_paths.append(hashedPaths[0])
    return no_duplicate_paths
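

# A minimal usage sketch, assuming a local "images" directory of downloaded
# pictures (the directory name and the printed message are hypothetical, not
# part of the original pipeline):
if __name__ == "__main__":
    imagePaths = list(paths.list_images("images"))
    unique_paths = duplicate_detector(imagePaths)
    print("{} unique images out of {}".format(len(unique_paths), len(imagePaths)))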