Hi Michael,
Attached is the acctst.c regression test I added, based on your test code.
I have pushed this change back to acctst.c in Launchpad.
I have also written a fix which is passing regression tests. I will probably also push that up in the morning if I don’t see any issues when I review it with fresh eyes.
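For context while you review: the property the test pins down is that ca_clear_event() must never allow a subscription callback to run after it returns. I won't paste the fix here before it's pushed, but as a generic sketch (using the libCom primitives, and emphatically not the actual CA internals) the guard it has to implement looks something like this:

/* generic sketch of a "no late callback" guard -- NOT the actual fix;
 * destroy blocks until no callback for the subscription is in flight;
 * lock/idle would be created with epicsMutexMustCreate () and
 * epicsEventMustCreate ( epicsEventEmpty ) */
struct subscrGuard {
    epicsMutexId lock;
    epicsEventId idle;   /* signaled when the last in-flight callback exits */
    int callbackBusy;    /* nonzero while a callback is running */
    int destroyed;       /* set once destroy has begun */
};

static void guardedCallback ( struct subscrGuard * pG )
{
    epicsMutexMustLock ( pG->lock );
    if ( pG->destroyed ) {
        /* destroy has begun: drop the late callback */
        epicsMutexUnlock ( pG->lock );
        return;
    }
    pG->callbackBusy++;
    epicsMutexUnlock ( pG->lock );

    /* ... run the user callback here, outside the lock ... */

    epicsMutexMustLock ( pG->lock );
    pG->callbackBusy--;
    if ( pG->callbackBusy == 0 && pG->destroyed ) {
        epicsEventSignal ( pG->idle );
    }
    epicsMutexUnlock ( pG->lock );
}

static void guardedDestroy ( struct subscrGuard * pG )
{
    epicsMutexMustLock ( pG->lock );
    pG->destroyed = TRUE;
    while ( pG->callbackBusy > 0 ) {
        epicsMutexUnlock ( pG->lock );
        epicsEventMustWait ( pG->idle );  /* block until callbacks drain */
        epicsMutexMustLock ( pG->lock );
    }
    epicsMutexUnlock ( pG->lock );
}

Note that in this naive form a destroy issued from within the callback itself would block forever, which ties directly into Michael's deadlock observation quoted below; the real code has to detect and special-case that.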
Jeff
#define multiSubscrDestroyNoLateCallbackEventCount 500

struct MultiSubscrDestroyNoLateCallbackEventData {
    evid m_id;
    size_t m_nCallback;
    int m_callbackIsOk;
    struct MultiSubscrDestroyNoLateCallbackTestData * m_pTestData;
};

struct MultiSubscrDestroyNoLateCallbackTestData {
    const char * m_pChanName;
    chid m_chan;
    epicsMutexId m_mutex;
    epicsEventId m_testDoneEvent;
    unsigned m_interestLevel;
    struct MultiSubscrDestroyNoLateCallbackEventData
        m_eventData [multiSubscrDestroyNoLateCallbackEventCount];
};
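/*
 * subscription update callback; verifies, while holding the test mutex,
 * that the subscription it was queued for has not already been destroyed
 */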
static void noLateCallbackDetect ( struct event_handler_args args )
{
    int callbackIsOk;
    struct MultiSubscrDestroyNoLateCallbackEventData * const pEventData = args.usr;
    epicsMutexLockStatus lockStatus = epicsMutexLock ( pEventData->m_pTestData->m_mutex );
    callbackIsOk = pEventData->m_callbackIsOk;
    pEventData->m_nCallback++;
    epicsMutexUnlock ( pEventData->m_pTestData->m_mutex );
    verify ( lockStatus == epicsMutexLockOK );
    verify ( callbackIsOk );
}
static void multiSubscrDestroyNoLateCallbackThread ( void * pParm )
{
    struct MultiSubscrDestroyNoLateCallbackTestData * const pTestData =
                ( struct MultiSubscrDestroyNoLateCallbackTestData * ) pParm;
    unsigned i, j;
    int status;

    status = ca_context_create ( ca_enable_preemptive_callback );
    verify ( status == ECA_NORMAL );
    status = ca_create_channel ( pTestData->m_pChanName, 0, 0,
                CA_PRIORITY_DEFAULT, &pTestData->m_chan );
    SEVCHK ( status, "multiSubscrDestroyNoLateCallbackTest: channel create failed" );
    verify ( status == ECA_NORMAL );
    status = ca_pend_io ( timeoutToPendIO );
    SEVCHK ( status, "multiSubscrDestroyNoLateCallbackTest: channel connect failed" );
    verify ( status == ECA_NORMAL );

    /*
     * create a set of subscriptions
     */
    for ( i=0; i < 10000; i++ ) {
        unsigned int priorityOfTestThread;
        for ( j=0; j < multiSubscrDestroyNoLateCallbackEventCount; j++ ) {
            epicsMutexLockStatus lockStatus = epicsMutexLock ( pTestData->m_mutex );
            verify ( lockStatus == epicsMutexLockOK );
            pTestData->m_eventData[j].m_nCallback = 0;
            pTestData->m_eventData[j].m_callbackIsOk = TRUE;
            pTestData->m_eventData[j].m_pTestData = pTestData;
            epicsMutexUnlock ( pTestData->m_mutex );
            SEVCHK ( ca_add_event ( DBR_GR_FLOAT, pTestData->m_chan, noLateCallbackDetect,
                &pTestData->m_eventData[j], &pTestData->m_eventData[j].m_id ) , NULL );
        }
        SEVCHK ( ca_flush_io(), NULL );

        /*
         * raise the priority of the current thread hoping to improve our
         * likelihood of detecting a bug
         */
        priorityOfTestThread = epicsThreadGetPrioritySelf ();
        epicsThreadSetPriority ( epicsThreadGetIdSelf(), epicsThreadPriorityHigh );

        /*
         * wait for the first subscription update to arrive
         */
        {
            epicsMutexLockStatus lockStatus = epicsMutexLock ( pTestData->m_mutex );
            verify ( lockStatus == epicsMutexLockOK );
            while ( pTestData->m_eventData[0].m_nCallback == 0 ) {
                epicsMutexUnlock ( pTestData->m_mutex );
                epicsThreadSleep ( 50e-6 );
                lockStatus = epicsMutexLock ( pTestData->m_mutex );
                verify ( lockStatus == epicsMutexLockOK );
            }
            epicsMutexUnlock ( pTestData->m_mutex );
        }
        /*
         * try to destroy all of the subscriptions at precisely the same time that
         * their first callbacks are running
         */
        for ( j=0; j < multiSubscrDestroyNoLateCallbackEventCount; j++ ) {
            epicsMutexLockStatus lockStatus;
            SEVCHK ( ca_clear_event ( pTestData->m_eventData[j].m_id ) , NULL );
            lockStatus = epicsMutexLock ( pTestData->m_mutex );
            verify ( lockStatus == epicsMutexLockOK );
            pTestData->m_eventData[j].m_callbackIsOk = FALSE;
            epicsMutexUnlock ( pTestData->m_mutex );
        }

        /*
         * return to the original priority
         */
        epicsThreadSetPriority ( epicsThreadGetIdSelf(), priorityOfTestThread );

        if ( i % 1000 == 0 ) {
            showProgress ( pTestData->m_interestLevel );
        }
    }
    SEVCHK ( ca_clear_channel ( pTestData->m_chan ), NULL );
    ca_context_destroy ();
    epicsEventMustTrigger ( pTestData->m_testDoneEvent );
}
/*
 * verify that, in a preemptive callback mode client, a subscription callback never
 * comes after the subscription is destroyed
 */
static void multiSubscrDestroyNoLateCallbackTest ( const char *pName, unsigned interestLevel )
{
    struct MultiSubscrDestroyNoLateCallbackTestData * pTestData;

    showProgressBegin ( "multiSubscrDestroyNoLateCallbackTest", interestLevel );

    pTestData = calloc ( 1u, sizeof ( struct MultiSubscrDestroyNoLateCallbackTestData ) );
    verify ( pTestData );
    pTestData->m_mutex = epicsMutexMustCreate ();
    pTestData->m_testDoneEvent = epicsEventMustCreate ( epicsEventEmpty );
    pTestData->m_pChanName = pName;
    pTestData->m_interestLevel = interestLevel;
    epicsThreadMustCreate (
                "multiSubscrDestroyNoLateCallbackTest",
                epicsThreadPriorityLow,
                epicsThreadGetStackSize ( epicsThreadStackMedium ),
                multiSubscrDestroyNoLateCallbackThread,
                pTestData );

    /*
     * wait for test to complete
     */
    epicsEventMustWait ( pTestData->m_testDoneEvent );

    /*
     * cleanup
     */
    epicsMutexDestroy ( pTestData->m_mutex );
    epicsEventDestroy ( pTestData->m_testDoneEvent );
    free ( pTestData );

    showProgressEnd ( interestLevel );
}
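For anyone wiring this up: the test hooks into the acctst run sequence like the other tests there; a minimal sketch of the call site (the "fred" channel name is hypothetical, standing in for whatever PV name the harness passes through):

    /* hypothetical call site in the acctst run sequence; "fred" stands
     * in for the PV name supplied by the harness */
    multiSubscrDestroyNoLateCallbackTest ( "fred", interestLevel );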
> -----Original Message-----
> From: [email protected] [mailto:[email protected]] On Behalf
> Of Michael Abbott
> Sent: Tuesday, May 14, 2013 12:25 AM
> To: Hill, Jeff
> Subject: [Bug 1179642] Re: race condition when destroying subscription in
> preemptive callback mode application
>
> Here's the attachment from the original e-mail.
>
> ** Attachment added: "Test IOC to demonstrate race condition reported here"
> https://bugs.launchpad.net/epics-base/+bug/1179642/+attachment/3675984/+files/test.tgz
>
> --
> You received this bug notification because you are subscribed to the bug
> report.
> https://bugs.launchpad.net/bugs/1179642
>
> Title:
> race condition when destroying subscription in preemptive callback mode
> application
>
> Status in EPICS Base:
> Confirmed
>
> Bug description:
> From Michael Abbott
>
> see http://www.aps.anl.gov/epics/tech-talk/2012/msg00584.php
>
> > > In the attached test IOC I repeatedly create 500 subscriptions to 500
> > > locally published PVs, pause a few hundred microseconds, and then
> > > proceed to tear them all down again. The context pointer I pass
> > > (args.usr) just contains a validity flag which I reset after
> > > ca_clear_subscription returns -- and which I test in the callback.
> > >
> > > Below is a typical run:
> > >
> > > $ ./test 10 500
> > > dbLoadDatabase("dbd/TEST.dbd", NULL, NULL)
> > > TEST_registerRecordDeviceDriver(pdbbase)
> > > dbLoadRecords("db/TEST.db", NULL)
> > > iocInit()
> > > Starting iocInit
> > >
> > > ############################################################################
> > > ## EPICS R3.14.11 $R3-14-11$ $2009/08/28 18:47:36$
> > > ## EPICS Base built Nov 4 2011
> > >
> > > ############################################################################
> > > iocRun: All initialization complete
> > > All channels connected
> > > Testing 10 cycles, interval 500 us
> > > [......................................................................
> > > .......................................................................
> > > .......................................................................
> > > .......................................................................
> > > .......................................................................
> > > .......................................................................
> > > ...............................................................whoops!
> > > ][
> > >
> > >
> > > The two arguments to `test` are number of times to try and how long to
> > > pause between create and clear (in microseconds, passed to usleep(3)).
> > > [ and ] are printed at the start and end of a cycle (so [ is
> > > immediately followed by a burst of ca_create_subscription() calls) and
> > > each . represents a successful callback. An unsuccessful (invalid)
> > > callback is shown by 'whoops!' which is followed by an exit() call.
> > >
> > > This test can be very delicate and difficult to reproduce, and may need
> > > to be run many times with slightly different pause intervals before
> > > being even partially repeatable -- the fault only appears to show when
> > > there isn't time for all 500 PVs to complete their initial updates, but
> > > there has to be enough time for them all to make the effort.
> > >
> > > Another interesting detail follows from some locking I'm doing. Here
> > > is an extract of the relevant code (LOCK() is just
> > > pthread_mutex_lock(3p) on a global mutex):
> > >
> > > 1 static void on_update(struct event_handler_args args)
> > > 2 {
> > > 3 struct event *event = args.usr;
> > > 4 LOCK();
> > > 5 bool valid = event->valid;
> > > 6 UNLOCK();
> > > 7 if (valid) ...
> > > 8 }
> > >
> > > ...
> > >
> > > 9 LOCK(); // This should trigger deadlock
> > > 10 ca_clear_subscription(event->event_id);
> > > 11 event->valid = false;
> > > 12 UNLOCK();
> > >
> > > It seems to me that if ca_clear_subscription() is correctly doing what
> > > we discussed a year ago, which is to say, if it is waiting for all
> > > outstanding callbacks to complete before returning, then the LOCK() on
> > > line 9 should trigger a deadlock when ca_clear_subscription() is called
> > > with its associated callback still only on line 3 (or earlier). But I
> > > never see my test deadlock.
> > >
> > > I'm seeing this problem occur on test code which is repeatedly creating
> > > and destroying subscriptions, but I've previously reported this on CA
> > > client shutdown, so it does look to me like there is a general
> > > synchronisation problem here. I believe I have a workaround, which is
> > > to delay releasing the callback context to give time for outstanding
> > > callbacks to complete, but this is a bit worrisome...
> >
> >
>
> To manage notifications about this bug go to:
> https://bugs.launchpad.net/epics-base/+bug/1179642/+subscriptions