softdevteam · Eddy114514 · Oct 29, 2025 · Nov 3, 2025 · Nov 6, 2025 · Nov 11, 2025
diff --git a/Include/cpython/bytesobject.h b/Include/cpython/bytesobject.h
@@ -1,10 +1,13 @@
 #ifndef Py_CPYTHON_BYTESOBJECT_H
 #  error "this header file must not be included directly"
 #endif
-
+#define BSTATE_NOT_SURE 0  /* no state recorded yet; the BSTATE_* values are repeated in cpython/unicodeobject.h */
+#define BSTATE_BYTE 1      /* set on a bytes operand that PyUnicode_Concat accepted */
+#define BSTATE_UNICODE 2   /* set by PyUnicode_New on the string it creates */
 typedef struct {
     PyObject_VAR_HEAD
     Py_DEPRECATED(3.11) Py_hash_t ob_shash;
+    unsigned int bstate;
     char ob_sval[1];
 
     /* Invariants:
@@ -14,6 +17,13 @@ typedef struct {
      */
 } PyBytesObject;
 
+#define PyBytes_GET_BSTATE(op)       (((PyBytesObject *)(op))->bstate)  /* reads op's bstate field; op is cast, not type-checked */
+#define PyBytes_SET_BSTATE(op, val)  (((PyBytesObject *)(op))->bstate = (unsigned int)(val))  /* stores val in op's bstate field */
+
+#define PG_BSTATE_LOAD_BYTES(op_) /* bstate of op_, or BSTATE_NOT_SURE for NULL or an invalid value */ \
+    (((op_) == NULL) ? BSTATE_NOT_SURE : \
+     PG_BSTATE_NORMALIZE(PyBytes_GET_BSTATE((op_))))  /* PG_BSTATE_NORMALIZE is defined in cpython/unicodeobject.h and must be visible where this expands */
+
 PyAPI_FUNC(int) _PyBytes_Resize(PyObject **, Py_ssize_t);
 PyAPI_FUNC(PyObject*) _PyBytes_FormatEx(
     const char *format,

diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
@@ -135,10 +135,15 @@ typedef struct {
         unsigned int ascii:1;
         /* Padding to ensure that PyUnicode_DATA() is always aligned to
            4 bytes (see issue #19537 on m68k). */
-        unsigned int :25;
+        unsigned int bstate:4;
+        unsigned int :21;
     } state;
 } PyASCIIObject;
 
+#define BSTATE_NOT_SURE 0  /* no state recorded yet; same three values as in cpython/bytesobject.h */
+#define BSTATE_BYTE 1      /* byte state */
+#define BSTATE_UNICODE 2   /* unicode state; PyUnicode_New stores it in the string it creates */
+
 /* Non-ASCII strings allocated through PyUnicode_New use the
    PyCompactUnicodeObject structure. state.compact is set, and the data
    immediately follow the structure. */
@@ -160,6 +165,47 @@ typedef struct {
     } data;                     /* Canonical, smallest-form Unicode buffer */
 } PyUnicodeObject;
 
+/* Pygrate bstate accessors.
+ *
+ * bstate is a bit-field of PyASCIIObject.state, and every str layout
+ * (PyASCIIObject, PyCompactUnicodeObject, PyUnicodeObject) begins with a
+ * PyASCIIObject.  A single cast therefore reaches the field for all of them,
+ * and `op` is evaluated only once.  `op` is not type-checked. */
+#define PyUnicode_GET_BSTATE(op) \
+    (((PyASCIIObject *)(op))->state.bstate)
+
+#define PyUnicode_SET_BSTATE(op, val) \
+    do { \
+        ((PyASCIIObject *)(op))->state.bstate = (val); \
+    } while (0)
+
+/* PG_BSTATE_MERGE(a, b) combines the bstate of a left operand `a` with that
+ * of a right operand `b`:
+ *
+ *     a         b = unsure   b = unicode   b = byte
+ *     unsure    unsure       unicode       byte
+ *     unicode   unicode      unicode       unicode
+ *     byte      byte         byte          byte
+ *
+ * In short: the result is `a`, unless `a` is unsure, in which case it is `b`.
+ */
+#define PG_BSTATE_MERGE(a, b) \
+    (((a) == BSTATE_NOT_SURE) ? (b) : (a))
+
+/* Nonzero when s_ is one of the three BSTATE_* values. */
+#define PG_BSTATE_IS_VALID(s_) \
+    ((s_) == BSTATE_NOT_SURE || (s_) == BSTATE_BYTE || (s_) == BSTATE_UNICODE)
+
+/* Yields s_ when it is a valid state and BSTATE_NOT_SURE otherwise. */
+#define PG_BSTATE_NORMALIZE(s_) \
+    (PG_BSTATE_IS_VALID((s_)) ? (s_) : BSTATE_NOT_SURE)
+
+/* Normalized bstate of the str object op_, or BSTATE_NOT_SURE when op_ is
+   NULL.  op_ is expanded more than once: pass a side-effect-free expression. */
+#define PG_BSTATE_LOAD_UNICODE(op_) \
+    (((op_) == NULL) ? BSTATE_NOT_SURE : \
+     PG_BSTATE_NORMALIZE(PyUnicode_GET_BSTATE((op_))))
+
 PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
     PyObject *op,
     int check_content);

diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
@@ -177,6 +177,7 @@ _Py_COMP_DIAG_IGNORE_DEPR_DECLS
     op->ob_shash = -1;
 _Py_COMP_DIAG_POP
     memcpy(op->ob_sval, str, size+1);
+    PyBytes_SET_BSTATE(op, BSTATE_NOT_SURE);
     return (PyObject *) op;
 }
 

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -1209,6 +1209,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
     unicode_fill_invalid((PyObject*)unicode, 0);
 #endif
     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
+    PyUnicode_SET_BSTATE(obj, BSTATE_UNICODE);
     return obj;
 }
 
@@ -10681,42 +10682,96 @@ PyUnicode_Concat(PyObject *left, PyObject *right)
     if (ensure_unicode(left) < 0)
         return NULL;
 
-    if (!PyUnicode_Check(right)) {
+    /* Pygrate: `text` is the str operand that is actually concatenated.  It is
+       `right` itself or, when a bytes object is accepted below, its Latin-1
+       decoding, which this function owns through `decoded` and releases at
+       `done`. */
+    PyObject *text = right;
+    PyObject *decoded = NULL;
+
+    /* bstate of each operand; merged into the result on the general path. */
+    int lb = PG_BSTATE_LOAD_UNICODE(left);
+    int rb = BSTATE_NOT_SURE;
+
+    if (PyUnicode_Check(right)) {
+        if (Py_Py2xWarningFlag) {
+            /* `right` is a str here, so its bstate has to be read with the
+               unicode accessor rather than the bytes one. */
+            rb = PG_BSTATE_LOAD_UNICODE(right);
+        }
+    }
+    else if (Py_Py2xWarningFlag && PyBytes_Check(right)) {
+        /* Only with Py_Py2xWarningFlag set: accept bytes by decoding it as
+           Latin-1, and tag the bytes operand as BSTATE_BYTE. */
+        decoded = PyUnicode_DecodeLatin1(PyBytes_AS_STRING(right),
+                                         PyBytes_GET_SIZE(right), NULL);
+        if (decoded == NULL)
+            return NULL;
+        text = decoded;
+        rb = BSTATE_BYTE;
+        PyBytes_SET_BSTATE(right, BSTATE_BYTE);
+    }
+    else {
         PyErr_Format(PyExc_TypeError,
                      "can only concatenate str (not \"%.200s\") to str",
                      Py_TYPE(right)->tp_name);
         return NULL;
     }
 
     /* Shortcuts */
     PyObject *empty = unicode_get_empty();  // Borrowed reference
+    /* NOTE(review): these early exits skip the bstate merge and the
+       mixed-operand warning done on the general path below. */
     if (left == empty) {
-        return PyUnicode_FromObject(right);
+        result = PyUnicode_FromObject(text);
+        goto done;
     }
-    if (right == empty) {
-        return PyUnicode_FromObject(left);
+    if (text == empty) {
+        result = PyUnicode_FromObject(left);
+        goto done;
     }
 
     left_len = PyUnicode_GET_LENGTH(left);
-    right_len = PyUnicode_GET_LENGTH(right);
+    right_len = PyUnicode_GET_LENGTH(text);
     if (left_len > PY_SSIZE_T_MAX - right_len) {
         PyErr_SetString(PyExc_OverflowError,
                         "strings are too large to concat");
-        return NULL;
+        result = NULL;
+        goto done;
     }
     new_len = left_len + right_len;
 
     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
-    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
+    maxchar2 = PyUnicode_MAX_CHAR_VALUE(text);
     maxchar = Py_MAX(maxchar, maxchar2);
 
     /* Concat the two Unicode strings */
     result = PyUnicode_New(new_len, maxchar);
     if (result == NULL)
-        return NULL;
+        goto done;
     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
-    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
+    _PyUnicode_FastCopyCharacters(result, left_len, text, 0, right_len);
     assert(_PyUnicode_CheckConsistency(result, 1));
+
+    if (Py_Py2xWarningFlag) {
+        PyUnicode_SET_BSTATE(result, PG_BSTATE_MERGE(lb, rb));
+        if (lb != BSTATE_NOT_SURE && rb != BSTATE_NOT_SURE && lb != rb) {
+            const char *lhs = (lb == BSTATE_UNICODE) ? "unicode" : "byte";
+            const char *rhs = (rb == BSTATE_UNICODE) ? "unicode" : "byte";
+            if (PyErr_WarnFormat(PyExc_Py2xWarning, 1,
+                    "implicit %s + %s concatenation; Python 3 would raise TypeError",
+                    lhs, rhs) < 0) {
+                /* The warning was turned into an exception: release the new
+                   string and return NULL instead of a value with an error
+                   set. */
+                Py_CLEAR(result);
+            }
+        }
+    }
+
+done:
+    /* Drop the temporary Latin-1 decoding, if one was created. */
+    Py_XDECREF(decoded);
     return result;
 }