-
Notifications
You must be signed in to change notification settings - Fork 4
Implemented b-state for unicodeobject #37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: migration
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -135,10 +135,15 @@ typedef struct { | |
| unsigned int ascii:1; | ||
| /* Padding to ensure that PyUnicode_DATA() is always aligned to | ||
| 4 bytes (see issue #19537 on m68k). */ | ||
| unsigned int :25; | ||
| unsigned int bstate:4; | ||
| unsigned int :21; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This line has now been divorced from its doc string, so the "always aligned to 4 bytes" no longer makes sense. That means that |
||
| } state; | ||
| } PyASCIIObject; | ||
|
|
||
| #define BSTATE_NOT_SURE 0 | ||
| #define BSTATE_BYTE 1 | ||
| #define BSTATE_UNICODE 2 | ||
|
|
||
| /* Non-ASCII strings allocated through PyUnicode_New use the | ||
| PyCompactUnicodeObject structure. state.compact is set, and the data | ||
| immediately follow the structure. */ | ||
|
|
@@ -160,6 +165,51 @@ typedef struct { | |
| } data; /* Canonical, smallest-form Unicode buffer */ | ||
| } PyUnicodeObject; | ||
|
|
||
| /* Macros for accessing Pygrate bstate */ | ||
| #define PyUnicode_GET_BSTATE(op) \ | ||
| (PyUnicode_IS_COMPACT(op) ? \ | ||
| ((PyCompactUnicodeObject *)(op))->_base.state.bstate : \ | ||
| ((PyUnicodeObject *)(op))->_base._base.state.bstate) | ||
|
|
||
| #define PyUnicode_SET_BSTATE(op, val) \ | ||
| do { \ | ||
| if (PyUnicode_IS_COMPACT(op)) \ | ||
| ((PyCompactUnicodeObject *)(op))->_base.state.bstate = (val); \ | ||
| else \ | ||
| ((PyUnicodeObject *)(op))->_base._base.state.bstate = (val); \ | ||
| } while (0) | ||
|
|
||
| /* | ||
| unsure + unsure = unsure | ||
| unsure + unicode = unicode | ||
| unsure + byte = byte | ||
|
|
||
| unicode + unsure = unicode | ||
| unicode + byte = unicode | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, so this is I think the big thing we need to consider (alongside We then have three choices: (1) we arbitrarily pick a winner (which is what the above logic does) (2) we revert back to "unsure" or (3) we have an additional "mixed" kind. I can see advantages to all three possibilities but it's hard to know which might be best. My probable criteria would be "which causes the fewest spurious warnings after the first warning?" I suspect we actually need to try this on some real code to know. |
||
| unicode + unicode = unicode | ||
|
|
||
| byte + unsure = byte | ||
| byte + unicode = byte | ||
| byte + byte = byte | ||
|
|
||
| => if a = unsure then depend on b, | ||
| else depend on a | ||
|
|
||
| */ | ||
|
|
||
| #define PG_BSTATE_MERGE(a, b) \ | ||
| (((a) == BSTATE_NOT_SURE) ? (b) : (a)) | ||
|
|
||
| #define PG_BSTATE_IS_VALID(s_) \ | ||
| ((s_) == BSTATE_NOT_SURE || (s_) == BSTATE_BYTE || (s_) == BSTATE_UNICODE) | ||
|
|
||
| #define PG_BSTATE_NORMALIZE(s_) \ | ||
| (PG_BSTATE_IS_VALID((s_)) ? (s_) : BSTATE_NOT_SURE) | ||
|
|
||
| #define PG_BSTATE_LOAD_UNICODE(op_) \ | ||
| (((op_) == NULL) ? BSTATE_NOT_SURE : \ | ||
| PG_BSTATE_NORMALIZE(PyUnicode_GET_BSTATE((op_)))) | ||
|
|
||
| PyAPI_FUNC(int) _PyUnicode_CheckConsistency( | ||
| PyObject *op, | ||
| int check_content); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1209,6 +1209,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) | |
| unicode_fill_invalid((PyObject*)unicode, 0); | ||
| #endif | ||
| assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); | ||
| PyUnicode_SET_BSTATE(obj, BSTATE_UNICODE); | ||
| return obj; | ||
| } | ||
|
|
||
|
|
@@ -10681,42 +10682,84 @@ PyUnicode_Concat(PyObject *left, PyObject *right) | |
| if (ensure_unicode(left) < 0) | ||
| return NULL; | ||
|
|
||
| PyObject *right_u = right; | ||
| int need_decref_right_u = 0; | ||
|
|
||
| int lb = PG_BSTATE_LOAD_UNICODE(left); | ||
| int rb = BSTATE_NOT_SURE; | ||
|
|
||
| if (!PyUnicode_Check(right)) { | ||
| PyErr_Format(PyExc_TypeError, | ||
| if (Py_Py2xWarningFlag && PyBytes_Check(right)) { | ||
| const char *buf = PyBytes_AS_STRING(right); | ||
| Py_ssize_t n = PyBytes_GET_SIZE(right); | ||
| right_u = PyUnicode_DecodeLatin1(buf, n, NULL); | ||
| if (right_u == NULL) | ||
| return NULL; | ||
| need_decref_right_u = 1; | ||
| rb = BSTATE_BYTE; | ||
| PyBytes_SET_BSTATE(right, BSTATE_BYTE); | ||
| } | ||
| else { | ||
| PyErr_Format(PyExc_TypeError, | ||
| "can only concatenate str (not \"%.200s\") to str", | ||
| Py_TYPE(right)->tp_name); | ||
| return NULL; | ||
| return NULL; | ||
| } | ||
| } | ||
| else{ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Formatting has gone wonky here. |
||
| if(Py_Py2xWarningFlag){ | ||
| rb = PG_BSTATE_LOAD_BYTES(right); | ||
| } | ||
| } | ||
|
|
||
| /* Shortcuts */ | ||
| PyObject *empty = unicode_get_empty(); // Borrowed reference | ||
| if (left == empty) { | ||
| return PyUnicode_FromObject(right); | ||
| PyObject *r = PyUnicode_FromObject(right_u); | ||
| if (need_decref_right_u) Py_DECREF(right_u); | ||
| return r; | ||
| } | ||
| if (right == empty) { | ||
| return PyUnicode_FromObject(left); | ||
| if (right_u == empty) { | ||
| PyObject *r = PyUnicode_FromObject(left); | ||
| if (need_decref_right_u) Py_DECREF(right_u); | ||
| return r; | ||
| } | ||
|
|
||
| left_len = PyUnicode_GET_LENGTH(left); | ||
| right_len = PyUnicode_GET_LENGTH(right); | ||
| right_len = PyUnicode_GET_LENGTH(right_u); | ||
| if (left_len > PY_SSIZE_T_MAX - right_len) { | ||
| PyErr_SetString(PyExc_OverflowError, | ||
| "strings are too large to concat"); | ||
| if (need_decref_right_u) Py_DECREF(right_u); | ||
| PyErr_SetString(PyExc_OverflowError, "strings are too large to concat"); | ||
| return NULL; | ||
| } | ||
| new_len = left_len + right_len; | ||
|
|
||
| maxchar = PyUnicode_MAX_CHAR_VALUE(left); | ||
| maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); | ||
| maxchar2 = PyUnicode_MAX_CHAR_VALUE(right_u); | ||
| maxchar = Py_MAX(maxchar, maxchar2); | ||
|
|
||
| /* Concat the two Unicode strings */ | ||
| result = PyUnicode_New(new_len, maxchar); | ||
| if (result == NULL) | ||
| if (result == NULL) { | ||
| if (need_decref_right_u) Py_DECREF(right_u); | ||
| return NULL; | ||
| } | ||
| _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len); | ||
| _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len); | ||
| _PyUnicode_FastCopyCharacters(result, left_len, right_u, 0, right_len); | ||
| assert(_PyUnicode_CheckConsistency(result, 1)); | ||
|
|
||
| if(Py_Py2xWarningFlag){ | ||
| PyUnicode_SET_BSTATE(result, PG_BSTATE_MERGE(lb, rb)); | ||
| if(lb != BSTATE_NOT_SURE && rb != BSTATE_NOT_SURE && lb != rb){ | ||
| const char *lhs = (lb == BSTATE_UNICODE) ? "unicode" : "byte"; | ||
| const char *rhs = (rb == BSTATE_UNICODE) ? "unicode" : "byte"; | ||
| PyErr_WarnFormat(PyExc_Py2xWarning, 1, | ||
| "implicit %s + %s concatenation; Python 3 would raise TypeError", | ||
| lhs, rhs); | ||
| } | ||
| } | ||
|
|
||
| if (need_decref_right_u) Py_DECREF(right_u); | ||
| return result; | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not really sure what the intention of this macro is, and the one call site doesn't immediately make it obvious to me. Can we provide a quick doc string?