From aac2eb6994cd270de78f0b11d69ecd718b88834a Mon Sep 17 00:00:00 2001 From: Ryan Kirkman Date: Wed, 28 Oct 2015 18:31:41 -0700 Subject: [PATCH 1/4] Initial commit of JSON response support --- README.md | 33 +------ config_schema.json | 5 +- htmldiff2.py | 221 +++++++++++++++++++++++++++++++-------------- 3 files changed, 156 insertions(+), 103 deletions(-) diff --git a/README.md b/README.md index 30225bf..10be5c0 100644 --- a/README.md +++ b/README.md @@ -20,35 +20,4 @@ optional arguments: --debug disable threading for debug purposes ``` -JSON config file schema: -```JSON -{ - "title": "htmldiff config", - "type": "object", - "properties": { - "servers": { - "type": "array", - "items": { - "type": "object", - "properties": { - "base_url": {"type": "string"}, - "auth": { - "type": "array", - "items": { "type": "string" } - }, - "protocol": {"type" : "string"} - }, - "required": ["base_url"] - } - }, - "relative_urls": { - "type": "array", - "minItems": 1, - "items": { "type": "string" }, - "uniqueItems": true - }, - "selectors": { "type": "object" } - }, - "required": ["servers", "relative_urls", "selectors"] -} -``` +JSON config file schema: see `config_schema.json` \ No newline at end of file diff --git a/config_schema.json b/config_schema.json index fa9b2f6..39de39e 100644 --- a/config_schema.json +++ b/config_schema.json @@ -23,7 +23,8 @@ "items": { "type": "string" }, "uniqueItems": true }, - "selectors": { "type": "object" } + "selectors": { "type": "object" }, + "keys": { "type": "array" } }, - "required": ["servers", "relative_urls", "selectors"] + "required": ["servers", "relative_urls"] } diff --git a/htmldiff2.py b/htmldiff2.py index 4837231..eb10ad5 100755 --- a/htmldiff2.py +++ b/htmldiff2.py @@ -6,7 +6,6 @@ import functools import json from multiprocessing.dummy import Pool as ThreadPool -import sys import jsonschema import lxml.html @@ -17,100 +16,182 @@ class Server(object): - def __init__(self, base_url, protocol='http', auth=None): self.base_url = base_url self.protocol = protocol self.auth = tuple(auth) if auth else None + def __str__(self): + return self.get_url() + + @staticmethod + def compare_pages( + relative_urls, servers, html, json, threads=1, debug=False, **kwargs): + """ + relative_urls: list of str URLs + servers: list of Server objects + html: Boolean for HTML response type + json: Boolean for JSON response type + """ + _servers = [ + Server.factory(html=html, json=json, **server_config) + for server_config in servers] + func = functools.partial(_servers[0].compare_page, servers=_servers, **kwargs) + if debug: + differences = map(func, relative_urls) + else: + pool = ThreadPool(threads) + differences = pool.map(func, relative_urls) + pool.close() + pool.join() + + # Flatten the list + return reduce(lambda x, y: x + y, differences) + + @staticmethod + def factory(html=None, json=None, **config): + if html: + return HtmlServer(**config) + elif json: + return JsonServer(**config) + + raise Exception('Factory failed to create class') + def get_full_url(self, relative_url): return "{}://{}{}".format(self.protocol, self.base_url, relative_url) - def get_text_response(self, relative_url): - url = self.get_full_url(relative_url) - r = requests.get(url, auth=self.auth) - if r.status_code != 200: - raise Exception("Got status code {} for URL {}".format(r.status_code, url)) - r.encoding = 'utf-8' - return r.text - def get_dom_tree(self, relative_url): - """ Build the DOM Tree """ - return lxml.html.fromstring(self.get_text_response(relative_url)) +class HtmlServer(Server): + def __init__(self, base_url, protocol='http', auth=None): + Server.__init__(self, base_url, protocol, auth) - def __str__(self): - return self.get_url() + @staticmethod + def compare_page(relative_url, servers, selectors): + differences = [] + trees = OrderedDict() + for server in servers: + trees[server.get_full_url(relative_url)] = server.get_dom_tree(relative_url) -def get_text_from_tree(tree, selector, strip_whitespace=True): - # construct a CSS Selector - sel = CSSSelector(selector) + for selector_name, selector in selectors.iteritems(): + results = [HtmlServer.get_text_from_tree(tree, selector) for _, tree in trees.iteritems()] - # Apply the selector to the DOM tree. - results = sel(tree) + # If all results are equal, if we construct a set from the results, + # the length of the set should be 1 + if len(set(results)) != 1: + differences.append( + HtmlServer.mismatched_error_message(relative_url, selector_name, selector, trees, results)) - # Return an empty string for diffing if we match nothing - if len(results) < 1: - return '' + return differences - # get the html out of all the results - data = [lxml.html.tostring(result) for result in results] + @staticmethod + def mismatched_error_message(relative_url, selector_name, selector, trees, results): + msg = [] + msg.append("-------------------------") + msg.append("Error: mismatched results") + for url, _ in trees.iteritems(): + msg.append(" - {}".format(url)) + msg.append("Selector name: {}".format(selector_name)) + msg.append("Selector: {}".format(selector)) + msg.append("") + msg.append('\n'.join(difflib.ndiff(results[0].splitlines(), results[1].splitlines()))) - if strip_whitespace: - data = [result.strip() for result in data] + return '\n'.join(msg) - return data[0] + @staticmethod + def get_text_from_tree(tree, selector, strip_whitespace=True): + # construct a CSS Selector + sel = CSSSelector(selector) + # Apply the selector to the DOM tree. + results = sel(tree) -def mismatched_error_message(relative_url, selector_name, selector, trees, results): - msg = [] - msg.append("-------------------------") - msg.append("Error: mismatched results") - for url, _ in trees.iteritems(): - msg.append(" - {}".format(url)) - msg.append("Selector name: {}".format(selector_name)) - msg.append("Selector: {}".format(selector)) - msg.append("") - msg.append('\n'.join(difflib.ndiff(results[0].splitlines(), results[1].splitlines()))) + # Return an empty string for diffing if we match nothing + if len(results) < 1: + return '' - return '\n'.join(msg) + # get the html out of all the results + data = [lxml.html.tostring(result) for result in results] + if strip_whitespace: + data = [result.strip() for result in data] -def compare_page(relative_url, servers, selectors): - differences = [] + return data[0] + + def get_dom_tree(self, relative_url): + """ Build the DOM Tree """ + return lxml.html.fromstring(self.get_response(relative_url)) - trees = OrderedDict() - for server in servers: - trees[server.get_full_url(relative_url)] = server.get_dom_tree(relative_url) + def get_response(self, relative_url): + url = self.get_full_url(relative_url) + r = requests.get(url, auth=self.auth) + if r.status_code != 200: + raise Exception("Got status code {} for URL {}".format(r.status_code, url)) + r.encoding = 'utf-8' + return r.text - for selector_name, selector in selectors.iteritems(): - results = [get_text_from_tree(tree, selector) for _, tree in trees.iteritems()] - # If all results are equal, if we construct a set from the results, - # the length of the set should be 1 - if len(set(results)) != 1: - differences.append(mismatched_error_message(relative_url, selector_name, selector, trees, results)) +class JsonServer(Server): + def __init__(self, base_url, protocol='http', auth=None): + Server.__init__(self, base_url, protocol, auth) - return differences + @staticmethod + def compare_page(relative_url, servers, keys=None): + differences = [] + server_responses = OrderedDict() + for server in servers: + server_responses[server.get_full_url(relative_url)] = server.get_response(relative_url) -def compare_pages(relative_urls, servers, selectors, threads=1, debug=False): - """ - relative_urls: list of str URLs - selectors: dict of selecton_name, str CSS selector - servers: list of Server objects - """ - func = functools.partial(compare_page, servers=servers, selectors=selectors) - if debug: - differences = map(func, relative_urls) - else: - pool = ThreadPool(threads) - differences = pool.map(func, relative_urls) - pool.close() - pool.join() + results = [] + for _, response in server_responses.iteritems(): + results.append(json.dumps( + JsonServer.pluck(response, keys), + sort_keys=True, + indent=4)) - # Flatten the list - return reduce(lambda x, y: x + y, differences) + # If all results are equal, if we construct a set from the results, + # the length of the set should be 1 + if len(set(results)) != 1: + differences.append( + JsonServer.mismatched_error_message(relative_url, results)) + + return differences + + @staticmethod + def pluck(json_obj, keys=None): + if not keys or not isinstance(keys, list): + return json_obj + + plucked = {} + for key in keys: + split = key.split('.') + temp_obj = json_obj + for i in xrange(len(split)): + temp = temp_obj.get(split[i]) + if temp: + temp_obj = temp + else: + break + plucked[key] = temp_obj + + return plucked + + @staticmethod + def mismatched_error_message(relative_url, results): + msg = [] + msg.append("-------------------------") + msg.append("Error: mismatched results") + msg.append("") + msg.append('\n'.join(difflib.ndiff(results[0].splitlines(), results[1].splitlines()))) + return '\n'.join(msg) + + def get_response(self, relative_url): + url = self.get_full_url(relative_url) + r = requests.get(url, auth=self.auth) + if r.status_code != 200: + raise Exception("Got status code {} for URL {}".format(r.status_code, url)) + return r.json() def parse_args(): @@ -128,6 +209,9 @@ def parse_args(): parser.add_argument("--show-config-format", help="show the config format", action="store_true") parser.add_argument("-t", "--threads", type=int, default=1, help="set the number of threads") parser.add_argument("--debug", help="disable threading for debug purposes", action="store_true") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("--html", help="Parse responses as HTML", action="store_true") + group.add_argument("--json", help="Parse responses as JSON", action="store_true") return parser.parse_args() @@ -136,15 +220,14 @@ def parse_config_file(filename): config_schema = json.load(config_schema_f) config = json.load(config_f) jsonschema.validate(config, config_schema) - config['servers'] = [Server(**server_config) for server_config in config['servers']] return config if __name__ == "__main__": args = parse_args() config = parse_config_file(args.config) - differences = compare_pages(threads=args.threads, debug=args.debug, **config) + differences = Server.compare_pages(threads=args.threads, debug=args.debug, html=args.html, json=args.json, **config) print "Number of differences: {}".format(len(differences)) for difference in differences: - print difference + print difference.encode('utf-8') From 18af8ffbc9ff6d379967260d4f1ca48c6f6549b1 Mon Sep 17 00:00:00 2001 From: Ryan Kirkman Date: Tue, 3 Nov 2015 15:20:45 -0800 Subject: [PATCH 2/4] Refactor and implement optional exception on non-200 response --- htmldiff2.py | 75 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/htmldiff2.py b/htmldiff2.py index eb10ad5..6e0a30a 100755 --- a/htmldiff2.py +++ b/htmldiff2.py @@ -16,8 +16,9 @@ class Server(object): - def __init__(self, base_url, protocol='http', auth=None): + def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None): self.base_url = base_url + self.ignore_non_200 = ignore_non_200 self.protocol = protocol self.auth = tuple(auth) if auth else None @@ -26,7 +27,7 @@ def __str__(self): @staticmethod def compare_pages( - relative_urls, servers, html, json, threads=1, debug=False, **kwargs): + relative_urls, servers, html, json, ignore_non_200=False, threads=1, debug=False, **kwargs): """ relative_urls: list of str URLs servers: list of Server objects @@ -34,7 +35,7 @@ def compare_pages( json: Boolean for JSON response type """ _servers = [ - Server.factory(html=html, json=json, **server_config) + Server.factory(html=html, json=json, ignore_non_200=ignore_non_200, **server_config) for server_config in servers] func = functools.partial(_servers[0].compare_page, servers=_servers, **kwargs) if debug: @@ -60,10 +61,20 @@ def factory(html=None, json=None, **config): def get_full_url(self, relative_url): return "{}://{}{}".format(self.protocol, self.base_url, relative_url) + def get_base_response(self, relative_url): + url = self.get_full_url(relative_url) + r = requests.get(url, auth=self.auth) + if r.status_code != 200: + if self.ignore_non_200: + return None + else: + raise Exception("Got status code {} for URL {}".format(r.status_code, url)) + return r + class HtmlServer(Server): - def __init__(self, base_url, protocol='http', auth=None): - Server.__init__(self, base_url, protocol, auth) + def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None): + Server.__init__(self, base_url, ignore_non_200, protocol, auth) @staticmethod def compare_page(relative_url, servers, selectors): @@ -71,7 +82,13 @@ def compare_page(relative_url, servers, selectors): trees = OrderedDict() for server in servers: - trees[server.get_full_url(relative_url)] = server.get_dom_tree(relative_url) + response = server.get_dom_tree(relative_url) + if not response: + # Early out for None server response + url = server.get_full_url(relative_url) + return ['Failed to retreive URL: {}'.format(url)] + # return [] + trees[server.get_full_url(relative_url)] = response for selector_name, selector in selectors.iteritems(): results = [HtmlServer.get_text_from_tree(tree, selector) for _, tree in trees.iteritems()] @@ -88,7 +105,7 @@ def compare_page(relative_url, servers, selectors): def mismatched_error_message(relative_url, selector_name, selector, trees, results): msg = [] msg.append("-------------------------") - msg.append("Error: mismatched results") + msg.append("Error - mismatched results for: {}".format(relative_url)) for url, _ in trees.iteritems(): msg.append(" - {}".format(url)) msg.append("Selector name: {}".format(selector_name)) @@ -111,29 +128,29 @@ def get_text_from_tree(tree, selector, strip_whitespace=True): return '' # get the html out of all the results - data = [lxml.html.tostring(result) for result in results] + data = [result.text for result in results] if strip_whitespace: - data = [result.strip() for result in data] + data = [result.strip() if isinstance(result, basestring) else None for result in data] return data[0] def get_dom_tree(self, relative_url): """ Build the DOM Tree """ - return lxml.html.fromstring(self.get_response(relative_url)) + response = self.get_response(relative_url) + return lxml.html.fromstring(response) if response else None def get_response(self, relative_url): - url = self.get_full_url(relative_url) - r = requests.get(url, auth=self.auth) - if r.status_code != 200: - raise Exception("Got status code {} for URL {}".format(r.status_code, url)) + r = Server.get_base_response(self, relative_url) + if not r: + return None r.encoding = 'utf-8' return r.text class JsonServer(Server): - def __init__(self, base_url, protocol='http', auth=None): - Server.__init__(self, base_url, protocol, auth) + def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None): + Server.__init__(self, base_url, ignore_non_200, protocol, auth) @staticmethod def compare_page(relative_url, servers, keys=None): @@ -141,7 +158,13 @@ def compare_page(relative_url, servers, keys=None): server_responses = OrderedDict() for server in servers: - server_responses[server.get_full_url(relative_url)] = server.get_response(relative_url) + response = server.get_response(relative_url) + if not response: + # Early out for None server response + url = server.get_full_url(relative_url) + return ['Failed to retreive URL: {}'.format(url)] + # return [] + server_responses[server.get_full_url(relative_url)] = response results = [] for _, response in server_responses.iteritems(): @@ -181,16 +204,15 @@ def pluck(json_obj, keys=None): def mismatched_error_message(relative_url, results): msg = [] msg.append("-------------------------") - msg.append("Error: mismatched results") + msg.append("Error - mismatched results for url: {}".format(relative_url)) msg.append("") msg.append('\n'.join(difflib.ndiff(results[0].splitlines(), results[1].splitlines()))) return '\n'.join(msg) def get_response(self, relative_url): - url = self.get_full_url(relative_url) - r = requests.get(url, auth=self.auth) - if r.status_code != 200: - raise Exception("Got status code {} for URL {}".format(r.status_code, url)) + r = Server.get_base_response(self, relative_url) + if not r: + return None return r.json() @@ -209,6 +231,7 @@ def parse_args(): parser.add_argument("--show-config-format", help="show the config format", action="store_true") parser.add_argument("-t", "--threads", type=int, default=1, help="set the number of threads") parser.add_argument("--debug", help="disable threading for debug purposes", action="store_true") + parser.add_argument("--ignore-non-200", help="ignore responses that aren't 200 OK", action="store_true") group = parser.add_mutually_exclusive_group(required=True) group.add_argument("--html", help="Parse responses as HTML", action="store_true") group.add_argument("--json", help="Parse responses as JSON", action="store_true") @@ -226,7 +249,13 @@ def parse_config_file(filename): if __name__ == "__main__": args = parse_args() config = parse_config_file(args.config) - differences = Server.compare_pages(threads=args.threads, debug=args.debug, html=args.html, json=args.json, **config) + differences = Server.compare_pages( + threads=args.threads, + debug=args.debug, + html=args.html, + json=args.json, + ignore_non_200=args.ignore_non_200, + **config) print "Number of differences: {}".format(len(differences)) for difference in differences: From bbb2f6bcee6a1ba3a2810ecd0e4b08e01409e280 Mon Sep 17 00:00:00 2001 From: Ryan Kirkman Date: Thu, 3 Dec 2015 17:11:39 -0800 Subject: [PATCH 3/4] Implement headers --- config_schema.json | 11 ++++++----- htmldiff2.py | 13 +++++++------ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/config_schema.json b/config_schema.json index 39de39e..f1c2554 100644 --- a/config_schema.json +++ b/config_schema.json @@ -10,9 +10,10 @@ "base_url": {"type": "string"}, "auth": { "type": "array", - "items": { "type": "string" } + "items": {"type": "string"} }, - "protocol": {"type" : "string"} + "protocol": {"type" : "string"}, + "headers": {"type": "object"} }, "required": ["base_url"] } @@ -20,11 +21,11 @@ "relative_urls": { "type": "array", "minItems": 1, - "items": { "type": "string" }, + "items": {"type": "string"}, "uniqueItems": true }, - "selectors": { "type": "object" }, - "keys": { "type": "array" } + "selectors": {"type": "object"}, + "keys": {"type": "array"} }, "required": ["servers", "relative_urls"] } diff --git a/htmldiff2.py b/htmldiff2.py index 6e0a30a..1beb8d4 100755 --- a/htmldiff2.py +++ b/htmldiff2.py @@ -16,11 +16,12 @@ class Server(object): - def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None): + def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None, headers=None): self.base_url = base_url self.ignore_non_200 = ignore_non_200 self.protocol = protocol self.auth = tuple(auth) if auth else None + self.headers = headers def __str__(self): return self.get_url() @@ -63,7 +64,7 @@ def get_full_url(self, relative_url): def get_base_response(self, relative_url): url = self.get_full_url(relative_url) - r = requests.get(url, auth=self.auth) + r = requests.get(url, auth=self.auth, headers=self.headers) if r.status_code != 200: if self.ignore_non_200: return None @@ -73,8 +74,8 @@ def get_base_response(self, relative_url): class HtmlServer(Server): - def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None): - Server.__init__(self, base_url, ignore_non_200, protocol, auth) + def __init__(*args, **kwargs): + Server.__init__(*args, **kwargs) @staticmethod def compare_page(relative_url, servers, selectors): @@ -149,8 +150,8 @@ def get_response(self, relative_url): class JsonServer(Server): - def __init__(self, base_url, ignore_non_200=False, protocol='http', auth=None): - Server.__init__(self, base_url, ignore_non_200, protocol, auth) + def __init__(*args, **kwargs): + Server.__init__(*args, **kwargs) @staticmethod def compare_page(relative_url, servers, keys=None): From 7e4d017034e34df220ab0c7bf25a0aa3a3560778 Mon Sep 17 00:00:00 2001 From: Ryan Kirkman Date: Fri, 4 Dec 2015 00:20:09 -0800 Subject: [PATCH 4/4] Implemented gnarly include_keys and ignore_keys Fixed lint errors --- config_schema.json | 3 ++- htmldiff2.py | 63 ++++++++++++++++++++++++++++++---------------- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/config_schema.json b/config_schema.json index f1c2554..7e92e8c 100644 --- a/config_schema.json +++ b/config_schema.json @@ -25,7 +25,8 @@ "uniqueItems": true }, "selectors": {"type": "object"}, - "keys": {"type": "array"} + "ignore_keys": {"type": "array"}, + "include_keys": {"type": "array"} }, "required": ["servers", "relative_urls"] } diff --git a/htmldiff2.py b/htmldiff2.py index 1beb8d4..23a9c50 100755 --- a/htmldiff2.py +++ b/htmldiff2.py @@ -28,7 +28,14 @@ def __str__(self): @staticmethod def compare_pages( - relative_urls, servers, html, json, ignore_non_200=False, threads=1, debug=False, **kwargs): + relative_urls, + servers, + html, + json, + ignore_non_200=False, + threads=1, + debug=False, + **kwargs): """ relative_urls: list of str URLs servers: list of Server objects @@ -92,13 +99,15 @@ def compare_page(relative_url, servers, selectors): trees[server.get_full_url(relative_url)] = response for selector_name, selector in selectors.iteritems(): - results = [HtmlServer.get_text_from_tree(tree, selector) for _, tree in trees.iteritems()] + results = [ + HtmlServer.get_text_from_tree(tree, selector) for _, tree in trees.iteritems()] # If all results are equal, if we construct a set from the results, # the length of the set should be 1 if len(set(results)) != 1: differences.append( - HtmlServer.mismatched_error_message(relative_url, selector_name, selector, trees, results)) + HtmlServer.mismatched_error_message( + relative_url, selector_name, selector, trees, results)) return differences @@ -154,7 +163,7 @@ def __init__(*args, **kwargs): Server.__init__(*args, **kwargs) @staticmethod - def compare_page(relative_url, servers, keys=None): + def compare_page(relative_url, servers, ignore_keys=None, include_keys=None): differences = [] server_responses = OrderedDict() @@ -170,7 +179,7 @@ def compare_page(relative_url, servers, keys=None): results = [] for _, response in server_responses.iteritems(): results.append(json.dumps( - JsonServer.pluck(response, keys), + JsonServer.pluck(response, ignore_keys, include_keys), sort_keys=True, indent=4)) @@ -183,23 +192,32 @@ def compare_page(relative_url, servers, keys=None): return differences @staticmethod - def pluck(json_obj, keys=None): - if not keys or not isinstance(keys, list): + def pluck(json_obj, ignore_keys=None, include_keys=None): + if not ignore_keys and not include_keys: return json_obj - - plucked = {} - for key in keys: - split = key.split('.') - temp_obj = json_obj - for i in xrange(len(split)): - temp = temp_obj.get(split[i]) - if temp: - temp_obj = temp - else: - break - plucked[key] = temp_obj - - return plucked + elif ignore_keys: + for key in ignore_keys: + split = key.split('.') + temp_obj = json_obj + for i in xrange(len(split) - 1): + temp_obj = temp_obj.get(split[i]) + del temp_obj[split[-1]] + elif include_keys: + plucked = {} + for key in include_keys: + split = key.split('.') + temp_obj = json_obj + for i in xrange(len(split)): + temp = temp_obj.get(split[i]) + if temp: + temp_obj = temp + else: + break + plucked[key] = temp_obj + + return plucked + + return json_obj @staticmethod def mismatched_error_message(relative_url, results): @@ -232,7 +250,8 @@ def parse_args(): parser.add_argument("--show-config-format", help="show the config format", action="store_true") parser.add_argument("-t", "--threads", type=int, default=1, help="set the number of threads") parser.add_argument("--debug", help="disable threading for debug purposes", action="store_true") - parser.add_argument("--ignore-non-200", help="ignore responses that aren't 200 OK", action="store_true") + parser.add_argument( + "--ignore-non-200", help="ignore responses that aren't 200 OK", action="store_true") group = parser.add_mutually_exclusive_group(required=True) group.add_argument("--html", help="Parse responses as HTML", action="store_true") group.add_argument("--json", help="Parse responses as JSON", action="store_true")